In [None]:
%load_ext autoreload
%autoreload 2
import sys, os
sys.path.insert(1, '../')
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patheffects as pe
import seaborn as sns
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import pdb
import xgboost as xgb
from scipy.stats import norm
from scipy.stats import multivariate_normal
from scipy.stats import bernoulli

## Implementation of classical inference, PPI with data splitting, and cross-prediction

In [None]:
def trial(X, Y, n, alpha, q):
    """Run one simulation trial and return the three competing intervals.

    The pooled sample ``(X, Y)`` is split at random into ``n`` labeled points
    and the remaining points, which are treated as unlabeled (their outcomes
    are discarded). Each method is then run on that same split.

    Parameters
    ----------
    X, Y : features and outcomes of the pooled sample.
    n : number of points kept as labeled data (``train_size`` of the split).
    alpha : error level forwarded to every interval routine.
    q : quantile level of interest.

    Returns
    -------
    tuple
        ``(classical_interval, pp_interval, cpp_interval)``.
    """
    X_lab, X_unlab, Y_lab, _ = train_test_split(X, Y, train_size=n)

    # the data-splitting PPI baseline fits its model on half of the labeled sample
    ppi_train_size = int(0.5*n)

    # evaluation order matters for the random stream: classical, PPI, cross-prediction
    return (
        classical_quantile_interval(Y_lab, alpha, q),
        pp_quantile_interval(X_lab, X_unlab, Y_lab, alpha, q, ppi_train_size),
        cross_prediction_quantile_interval(X_lab, X_unlab, Y_lab, alpha, q),
    )

In [None]:
def classical_quantile_interval(Y, alpha, q):
    """Classical confidence interval for the q-quantile from labeled outcomes only.

    The interval endpoints are order statistics of ``Y`` whose ranks are
    ``q*n -/+ z * sqrt(q*(1-q)*n)``, with ``z = norm.ppf(1 - alpha/2)``
    (rounded outwards with floor/ceil).

    Ranks that fall outside ``[0, n-1]`` are clamped to the smallest / largest
    observation. Without the clamp, a negative lower rank silently wrapped
    around to the end of the sorted array and an upper rank ``>= n`` raised
    ``IndexError``; both happen for small ``n`` or ``q`` close to 0 or 1.

    Parameters
    ----------
    Y : array-like of outcomes.
    alpha : error level; the interval targets coverage ``1 - alpha``.
    q : quantile level in (0, 1).

    Returns
    -------
    list
        ``[lower, upper]`` taken from the sorted values of ``Y``.

    Raises
    ------
    ValueError
        If ``Y`` is empty.
    """
    sorted_Y = np.sort(Y)
    n = len(sorted_Y)
    if n == 0:
        raise ValueError("classical_quantile_interval requires at least one observation")

    half_width = norm.ppf(1-alpha/2)*np.sqrt(q*(1-q)*n)
    l = np.floor(q*n - half_width)
    u = np.ceil(q*n + half_width)

    # keep both ranks inside the sample so indexing neither wraps around nor overflows
    lower_rank = int(np.clip(l, 0, n-1))
    upper_rank = int(np.clip(u, 0, n-1))

    return [sorted_Y[lower_rank], sorted_Y[upper_rank]]

In [None]:
def pp_quantile_interval(X_labeled, X_unlabeled, Y_labeled, alpha, q, n_tr):
    """Prediction-powered (PPI) interval for the q-quantile, using data splitting.

    The labeled data is split once: ``n_tr`` points fit an XGBoost regressor,
    the remaining labeled points ("validation") estimate how far the CDF of
    the predictions is from the CDF of the true outcomes. For each candidate
    ``theta`` on a grid, the CDF of the predictions on the unlabeled data is
    corrected by that estimated difference, and ``theta`` is accepted when the
    corrected CDF is within a normal-approximation margin of ``q``.

    Parameters
    ----------
    X_labeled, Y_labeled : labeled features / outcomes.
    X_unlabeled : unlabeled features.
    alpha : error level; the margin uses ``norm.ppf(1 - alpha/2)``.
    q : quantile level of interest.
    n_tr : passed to ``train_test_split`` as ``train_size`` for the labeled split.

    Returns
    -------
    list
        ``[ci_l, ci_u]``, the smallest and largest accepted grid values, or
        ``[nan, nan]`` when no grid value is accepted (previously this case
        crashed with ``UnboundLocalError`` because ``ci_u`` was never set).
    """
    N = X_unlabeled.shape[0]

    X_train, X_val, Y_train, Y_val = train_test_split(X_labeled, Y_labeled, train_size=n_tr)
    # use the actual validation size rather than n - n_tr, so the normalisation
    # stays right however train_test_split interprets n_tr
    n_val = len(Y_val)

    # fit the predictor; 10% of the training part is only used as a watchlist
    X_train1, X_train2, y_train1, y_train2 = train_test_split(X_train, Y_train, test_size=0.1)
    dtrain = xgb.DMatrix(X_train1, label=y_train1)
    dtest = xgb.DMatrix(X_train2, label=y_train2)
    # NOTE(review): 'error' is documented by XGBoost as a binary-classification metric;
    # it is only reported on the (silenced) watchlist here -- consider 'rmse' instead.
    param = {'max_depth': 7, 'eta': 0.1, 'objective': 'reg:squarederror', 'eval_metric': ['error', 'mae']}
    evallist = [(dtest, 'eval'), (dtrain, 'train')]
    num_round = 500
    tree = xgb.train(param, dtrain, num_round, evallist, verbose_eval=False)

    Yhat_unlabeled = tree.predict(xgb.DMatrix(X_unlabeled))
    Yhat_val = tree.predict(xgb.DMatrix(X_val))

    # form grid
    # NOTE(review): the grid spans only the range of the unlabeled predictions, so a
    # quantile outside that range can never be reported -- confirm this is intended.
    theta_L = np.min(Yhat_unlabeled)
    theta_U = np.max(Yhat_unlabeled)
    theta_grid = np.linspace(theta_L, theta_U, 1000)

    z = norm.ppf(1-alpha/2)  # loop-invariant critical value

    accepted = []
    for theta in theta_grid:
        below_unlabeled = Yhat_unlabeled <= theta
        # per-point difference between the true and the predicted indicator
        rectifier = 1*(Y_val <= theta) - (Yhat_val <= theta)

        Fhat_unlabeled = np.mean(below_unlabeled)
        Deltahat = np.mean(rectifier)

        sigma_hat = np.sqrt(np.var(below_unlabeled)/N + np.var(rectifier)/n_val)
        width = z * sigma_hat

        if np.abs(Fhat_unlabeled + Deltahat - q) <= width:
            accepted.append(theta)

    if not accepted:
        # empty confidence set on this grid
        return [np.nan, np.nan]

    return [accepted[0], accepted[-1]]

In [None]:
def cross_prediction_quantile_interval(X_labeled, X_unlabeled, Y_labeled, alpha, q, K = 10):
    """Cross-prediction confidence interval for the q-quantile.

    The labeled data is cut into ``K`` contiguous folds. For each fold an
    XGBoost regressor is trained on the other folds and used to predict
    (a) the held-out fold and (b) all unlabeled points, so every labeled point
    receives an out-of-fold prediction and every unlabeled point receives ``K``
    predictions. A grid value ``theta`` is accepted when the CDF of the
    unlabeled predictions, corrected by the labeled true-vs-predicted CDF
    difference, is within a normal-approximation margin of ``q``. The variance
    terms of that margin come from ``bootstrap_predictions``.

    Folds are built with ``np.array_split`` so that *every* labeled point is
    held out exactly once. Previously, when ``n`` was not a multiple of ``K``,
    the trailing ``n % K`` points were never predicted and kept a placeholder
    prediction of 0, which biased the correction term. When ``K`` divides
    ``n`` the folds are identical to the previous ones.

    Parameters
    ----------
    X_labeled, Y_labeled : labeled features / outcomes (NumPy arrays; indexed by position).
    X_unlabeled : unlabeled features.
    alpha : error level; the margin uses ``norm.ppf(1 - alpha/2)``.
    q : quantile level of interest.
    K : number of folds.

    Returns
    -------
    list
        ``[ci_l, ci_u]``, the smallest and largest accepted grid values, or
        ``[nan, nan]`` when no grid value is accepted (previously this case
        crashed with ``UnboundLocalError``).

    Raises
    ------
    ValueError
        If ``K < 2`` or there are fewer labeled points than folds.
    """
    n = X_labeled.shape[0]
    N = X_unlabeled.shape[0]

    if K < 2 or n < K:
        raise ValueError(f"need 2 <= K <= n labeled points, got K={K}, n={n}")

    fold_n = int(n/K)
    folds = np.array_split(np.arange(n), K)

    Yhat_labeled = np.zeros(n)
    Yhat_unlabeled = np.zeros(N*K)

    for j, val_ind in enumerate(folds):

        train_ind = np.delete(np.arange(n), val_ind)
        X_val = X_labeled[val_ind,:]
        X_train = X_labeled[train_ind,:]
        Y_train = Y_labeled[train_ind]

        # use train data to train a tree; 10% of it is only used as a watchlist
        X_train1, X_train2, y_train1, y_train2 = train_test_split(X_train, Y_train, test_size=0.1)
        dtrain = xgb.DMatrix(X_train1, label=y_train1)
        dtest = xgb.DMatrix(X_train2, label=y_train2)
        param = {'max_depth': 7, 'eta': 0.1, 'objective': 'reg:squarederror', 'eval_metric': ['error', 'mae']}
        evallist = [(dtest, 'eval'), (dtrain, 'train')]
        num_round = 500
        tree = xgb.train(param, dtrain, num_round, evallist, verbose_eval=False)

        # fold j's model predicts every unlabeled point and its own held-out points
        Yhat_unlabeled[N*j:N*(j+1)] = tree.predict(xgb.DMatrix(X_unlabeled))
        Yhat_labeled[val_ind] = tree.predict(xgb.DMatrix(X_val))

    # bootstrap models are trained on as many points as a single fold model sees
    Yhat_unlab_bs, Yhat_lab_bs, Y_paired_bs = bootstrap_predictions(X_labeled, X_unlabeled, Y_labeled, n-fold_n)

    # form grid
    theta_L = np.min(Yhat_unlabeled)
    theta_U = np.max(Yhat_unlabeled)
    theta_grid = np.linspace(theta_L, theta_U, 1000)

    z = norm.ppf(1-alpha/2)  # loop-invariant critical value

    accepted = []
    for theta in theta_grid:

        Fhat_unlabeled = np.mean(Yhat_unlabeled <= theta)
        Deltahat = 1/n * (np.sum(Y_labeled <= theta) - np.sum(Yhat_labeled <= theta))

        var_unlabeled = np.var(Yhat_unlab_bs <= theta)
        var_labeled = np.var(1*(Y_paired_bs <= theta) - (Yhat_lab_bs <= theta))

        var_hat = var_unlabeled/N + var_labeled/n
        width = z * np.sqrt(var_hat)

        if np.abs(Fhat_unlabeled + Deltahat - q) <= width:
            accepted.append(theta)

    if not accepted:
        # empty confidence set on this grid
        return [np.nan, np.nan]

    return [accepted[0], accepted[-1]]

In [None]:
def bootstrap_predictions(X_labeled, X_unlabeled, Y_labeled, train_n, B = 30):
    """Produce bootstrap predictions used to estimate the variance of cross-prediction.

    For each of ``B`` rounds, ``train_n`` labeled indices are drawn with
    replacement, an XGBoost regressor is fitted on them, and the model predicts
    (a) every unlabeled point and (b) ``n - train_n`` labeled points drawn
    without replacement from the indices that were not used for training.

    Parameters
    ----------
    X_labeled, Y_labeled : labeled features / outcomes (indexed by position).
    X_unlabeled : unlabeled features.
    train_n : number of indices drawn (with replacement) to train each model.
    B : number of bootstrap rounds.

    Returns
    -------
    tuple
        ``(Yhat_unlabeled, Yhat_labeled, Y_paired)``: the average over rounds of
        the unlabeled predictions (length ``N``); the held-out predictions of
        all rounds laid out round after round; and the true outcomes matching
        those held-out predictions position by position.
    """
    num_labeled = X_labeled.shape[0]
    num_unlabeled = X_unlabeled.shape[0]
    holdout_n = num_labeled - train_n

    holdout_preds = np.zeros(int(holdout_n*B))
    holdout_truth = np.zeros(int(holdout_n*B))
    mean_unlabeled_preds = np.zeros(num_unlabeled)

    booster_params = {'max_depth': 7, 'eta': 0.1, 'objective': 'reg:squarederror', 'eval_metric': ['error', 'mae']}
    boosting_rounds = 500
    every_index = np.arange(num_labeled)

    for b in range(B):

        # bootstrap sample (with replacement) of the labeled data
        boot_idx = np.random.choice(every_index, train_n)
        X_boot = X_labeled[boot_idx,:]
        Y_boot = Y_labeled[boot_idx]

        # fit a tree on the bootstrap sample; 10% of it only serves as a watchlist
        X_fit, X_watch, y_fit, y_watch = train_test_split(X_boot, Y_boot, test_size=0.1)
        fit_matrix = xgb.DMatrix(X_fit, label=y_fit)
        watch_matrix = xgb.DMatrix(X_watch, label=y_watch)
        watchlist = [(watch_matrix, 'eval'), (fit_matrix, 'train')]
        booster = xgb.train(booster_params, fit_matrix, boosting_rounds, watchlist, verbose_eval=False)

        # running average of the unlabeled predictions over the B models
        mean_unlabeled_preds += (booster.predict(xgb.DMatrix(X_unlabeled)))/B

        # hold-out points are drawn only from indices the model never trained on
        unused_idx = np.delete(every_index, boot_idx)
        holdout_idx = np.random.choice(unused_idx, holdout_n, replace = False)

        chunk = slice(b*holdout_n, (b+1)*holdout_n)
        holdout_preds[chunk] = booster.predict(xgb.DMatrix(X_labeled[holdout_idx, :]))
        holdout_truth[chunk] = Y_labeled[holdout_idx]

    return mean_unlabeled_preds, holdout_preds, holdout_truth

## Main cell: generate data and form intervals

In [None]:
# Simulation driver: for every R^2 value and every labeled sample size, repeat
# `num_trials` trials and record each method's interval and whether it covers
# the true quantile. Results are cached per R^2 value as a CSV file.
N = 10000 # size of unlabeled data
ns = np.array([100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]) # size of labeled data
num_trials = 100
alpha = 0.1

# parameters of data generating process:
d = 2
Rsqs = [0, 0.5, 1]
var_y = 4
mu = 4
beta = np.zeros(d)

df_list = []

q = 0.75 # quantile of interest

# store results
columns = ["lb","ub","coverage","estimator","n", r'$R^2$']

os.makedirs('./quantile_results/', exist_ok=True)

for Rsq in Rsqs:

    # for saving data
    filename = "quantile_results/" + f"Rsq_{Rsq}".replace(".", "_") + ".csv"
    if os.path.exists(filename):
        # Reuse the cached results. Merely skipping this R^2 value would leave it out of
        # final_df (and make pd.concat fail on an empty list when everything is cached).
        # The file was written with its index as the first column.
        df_list += [pd.read_csv(filename, index_col=0)]
        continue

    # compute target: the true quantile is approximated by the empirical quantile of a very large sample
    beta[:] = np.sqrt(Rsq * var_y / d)
    large_N = 1000000

    X_large = multivariate_normal.rvs(mean=np.zeros(d), cov=np.eye(d), size=large_N)
    y_large = X_large @ beta + np.sqrt(var_y * (1-Rsq))*np.random.randn(large_N) + mu
    theta_true = np.quantile(y_large, q)

    # one record (dict) per trial and estimator; the frame is built once at the end
    records = []
    for j in tqdm(range(ns.shape[0])):
        n = ns[j]
        for i in range(num_trials):

            # generate data
            X = multivariate_normal.rvs(mean=np.zeros(d), cov=np.eye(d), size=(n+N)) # feature matrix
            y = X @ beta + np.sqrt(var_y * (1-Rsq)) * np.random.randn(n+N) + mu # outcomes

            ci, ppi, cppi = trial(X, y, n, alpha, q)

            for estimator, (lb, ub) in (("cross-prediction", cppi), ("classical", ci), ("PPI", ppi)):
                records.append({
                    "lb": lb,
                    "ub": ub,
                    "coverage": bool((lb <= theta_true) & (theta_true <= ub)),
                    "estimator": estimator,
                    "n": n,
                    r'$R^2$': Rsq,
                })

    df = pd.DataFrame(records, columns=columns)
    df["width"] = df["ub"] - df["lb"]
    df_list += [df]

    # save data
    df.to_csv(filename)

final_df = pd.concat(df_list, ignore_index=True)

























100%|████████████████████████████████████████| 7/7 [14:32:42<00:00, 7480.42s/it]




























100%|████████████████████████████████████████| 7/7 [11:45:37<00:00, 6048.19s/it]


## Plot results

In [None]:
Rsqs = [0, 0.5, 1]
alpha = 0.1

# One row of panels per R^2 value: coverage (left) and interval width (right),
# both as a function of the labeled sample size n.
set2_palette = sns.color_palette("Set2")
col = np.array([set2_palette[1], set2_palette[2], set2_palette[0]])
sns.set_theme(font_scale=1.4, style='white', palette=col, rc={'lines.linewidth': 3})
fig, axs = plt.subplots(nrows=3, ncols=2, figsize=(10,10))

for row, rsq_value in enumerate(Rsqs):
    panel_df = final_df[(final_df[r'$R^2$'] == rsq_value)]
    sns.lineplot(ax=axs[row,0], data=panel_df, x='n', y='coverage', hue='estimator', alpha=0.9, errorbar=None)
    sns.lineplot(ax=axs[row,1], data=panel_df, x='n', y='width', hue='estimator', alpha=0.9)


grid = plt.GridSpec(3, 1)
for row, rsq_value in enumerate(Rsqs):
    # invisible full-width subplot whose only job is to carry the row title
    row_title_ax = fig.add_subplot(grid[row])
    # the trailing '\n' lifts the title above the real panels
    row_title_ax.set_title(f'R² = {rsq_value}\n', size=18)
    row_title_ax.set_axis_off()

for row in range(axs.shape[0]):
    coverage_ax = axs[row,0]
    # dashed reference line at the nominal coverage level
    coverage_ax.axhline(1-alpha, color="#888888", linestyle='dashed', zorder=1, alpha=0.9)
    coverage_ax.set_ylim([0.5,1])
    for column in range(axs.shape[1]):
        panel_ax = axs[row,column]
        if (row, column) != (0, 1):
            # every panel except the top-right one loses its legend
            panel_ax.get_legend().remove()
            continue
        # top-right panel keeps the legend, rebuilt without a title
        handles, labels = panel_ax.get_legend_handles_labels()
        panel_ax.legend(handles=handles, labels=labels)

sns.despine(top=True, right=True)
plt.tight_layout()

# save plot
plt.savefig('./quantile_results/quantile_comparison.pdf')
plt.show()

In [None]:
# for reading data after it has been saved
datadir = './quantile_results/'
# Keep only the per-R^2 result tables. os.listdir returns names in arbitrary order,
# so sort them to make the row order of final_df reproducible.
filenames = sorted(fn for fn in os.listdir(datadir) if 'Rsq' in fn and fn.endswith('.csv'))
if not filenames:
    raise FileNotFoundError(f"no saved 'Rsq*.csv' result files found in {datadir}")
# index_col=0: treat the first column as the row index, so a saved index does not
# come back as a stray 'Unnamed: 0' data column
data = [ pd.read_csv(os.path.join(datadir, fn), index_col=0) for fn in filenames ]
final_df = pd.concat(data, axis=0, ignore_index=True)