### Import necessary packages

In [None]:
%load_ext autoreload
%autoreload 2
import pyreadstat
import numpy as np
from sklearn.model_selection import train_test_split
import xgboost as xgb
import scipy
import pandas as pd
import seaborn as sns
from scipy.stats import norm, bernoulli
import matplotlib.pyplot as plt
import os, sys
import matplotlib.patheffects as pe
from utils import make_width_coverage_plot, make_budget_plot
import warnings; warnings.simplefilter('ignore')
from tqdm import tqdm

### Import Pew ATP Wave 79 dataset

The dataset is available at: https://www.pewresearch.org/science/dataset/american-trends-panel-wave-79/

In [None]:
# Read the SPSS .sav file from the local "pew" directory (path is relative to
# the notebook's working directory). `data` is the object later indexed by
# column name; `meta` is the second value returned by pyreadstat and is not
# used elsewhere in the visible cells.
data, meta = pyreadstat.read_sav("pew/ATPW79.sav")

### Problem setup

Specify estimand of interest (average approval of Biden's messaging or average approval of Trump's messaging) and compute ground-truth value of the estimand. Split data into labeled and unlabeled subsets. Specify range of budgets in fractional form $n_b/n$.

In [None]:
# Survey item that defines the estimand.
question = "ELECTBIDENMSSG_W79" # can choose ELECTBIDENMSSG_W79 or ELECTTRUMPMSSG_W79 

# Row positions whose response is not coded 99
# (presumably the refused / no-answer code -- TODO confirm against the codebook).
idx_keep = np.flatnonzero(data[question].to_numpy() != 99)

# Binary outcome: True when the coded response is below 2.5
# (presumably the "approve" half of the response scale -- TODO confirm).
Y_all = data[question].to_numpy()[idx_keep] < 2.5

# Covariates fed to the predictive model, restricted to the kept rows.
X_all = data[[
    'F_PARTYSUM_FINAL',
    'COVIDFOL_W79',
    'COVIDTHREAT_a_W79',
    'COVIDTHREAT_b_W79',
    'COVIDTHREAT_c_W79',
    'COVIDTHREAT_d_W79',
    'COVIDMASK1_W79',
    'COVID_SCI6E_W79',
    'F_EDUCCAT',
    'F_AGECAT',
]].to_numpy()[idx_keep]

# Ground-truth value of the estimand: the mean outcome over all kept rows.
theta_true = Y_all.mean()

# An integer train_size is an absolute row count: 10 rows train the initial
# model, everything else forms the pool (X, Y) used in the experiment.
X_train, X, y_train, Y = train_test_split(X_all, Y_all, train_size=10)

In [None]:
# Ten evenly spaced labeling budgets, expressed as fractions n_b / n of the pool.
budgets = np.linspace(start=0.15, stop=0.4, num=10)

### Train XGBoost model

Train XGBoost model on labeled data.

In [None]:
# Fit the initial booster on the small training split.
dtrain = xgb.DMatrix(data=X_train, label=y_train)
tree = xgb.train(
    params={'eta': 0.001, 'max_depth': 5, 'objective': 'reg:logistic'},
    dtrain=dtrain,
    num_boost_round=3000,
)

# Baseline predictions on the pool; these are the "no fine-tuning" predictions
# used throughout the main experiment.
Yhat = tree.predict(xgb.DMatrix(data=X))

### Main experiment

Forms dataframe ```df``` with experiment results. The columns in the dataframe are:

- ```lb``` - interval lower bound

- ```ub``` - interval upper bound

- ```interval width``` - equal to ```ub``` - ```lb```

- ```coverage``` - 0/1 indicator of whether or not interval covered target

- ```estimator``` - one of ```active (w/ fine-tuning)```, ```uniform```, or ```active (no fine-tuning)```

- ```$n_b$``` - budget size 

In [None]:
# Size of the pool the experiment iterates over.
n = Y.shape[0]

# num_trials: repetitions per budget.
# alpha:      error level; intervals use the norm.ppf(1 - alpha/2) quantile.
# tau:        weight on the uniform rate `budget` in the sampling-probability mix.
num_trials, alpha, tau = 100, 0.1, 0.5

# fine-tuning params
# batch_size:   number of collected labels that triggers a model update.
# steps:        boosting rounds added per update.
# greedy_steps: length of the window in which the leftover budget is spent.
batch_size, steps, greedy_steps = 100, 500, 100

In [None]:
def _sampling_prob(uncertainty, norm_const, budget, i, num_collected, greedy, tau):
    """Return the label-collection probability for the point at position ``i``.

    uncertainty   -- min(p, 1 - p) for the model's prediction p at this point
    norm_const    -- normalising constant (mean uncertainty) for the model in use
    budget        -- target fraction of labels to collect
    i             -- 0-based position in the stream
    num_collected -- labels collected so far by this sampler
    greedy        -- True inside the window where leftover budget is spent
    tau           -- weight of the uniform rate ``budget`` in the final mixture
    """
    # Budget still available after i+1 points, if spending exactly on target.
    remaining = (i + 1) * budget - num_collected
    if greedy:
        raw = remaining
    else:
        raw = np.clip(uncertainty / norm_const * budget, 0, np.maximum(0, remaining))
    # Mixing with the uniform rate keeps the probability >= tau * budget > 0,
    # so the inverse-probability weights below are always finite.
    return (1 - tau) * np.clip(raw, 0, 1) + tau * budget


def _interval_record(increments, theta_true, alpha, estimator, n_b):
    """Build one result row: a normal-approximation interval around the mean of ``increments``.

    Returns a dict keyed by the result columns. ``coverage`` is stored as
    0.0 / 1.0 so that averaging it gives the empirical coverage rate.
    """
    increments = np.asarray(increments)
    pointest = np.mean(increments)
    half_width = norm.ppf(1 - alpha / 2) * np.std(increments) / np.sqrt(len(increments))
    lb, ub = pointest - half_width, pointest + half_width
    return {
        "lb": lb,
        "ub": ub,
        "interval width": 2 * half_width,
        "coverage": float(lb <= theta_true <= ub),
        "estimator": estimator,
        "$n_b$": n_b,
    }


columns = ["lb", "ub", "interval width", "coverage", "estimator", "$n_b$"]
finetune_params = {'eta': 0.001, 'max_depth': 5, 'objective': 'reg:logistic'}

# One plain dict per (budget, trial, estimator). The DataFrame is built once at
# the end, instead of writing mixed-type rows into an all-float frame, which
# relies on silent dtype upcasting that pandas has deprecated.
results = []

# Normalising constant for the initial model: mean uncertainty on the training split.
Yhat_train = tree.predict(xgb.DMatrix(X_train))
C_init = np.mean(np.minimum(Yhat_train, 1 - Yhat_train))

for j in tqdm(range(len(budgets))):
    budget = budgets[j]
    budget_window = int(greedy_steps / budget)  # how often we use up remaining budget
    n_b = int(budget * n)

    for k in tqdm(range(num_trials)):

        # Fresh random ordering of the pool; Y, Yhat and X are permuted together
        # so they stay aligned.
        perm = np.random.permutation(n)
        Y = Y[perm]
        Yhat = Yhat[perm]
        X = X[perm]

        # Every trial starts from the initial model AND its normalising constant.
        # (Previously C kept the value left by the previous trial's fine-tuned
        # model, so trials were not independent of each other.)
        tree_new = tree.copy()
        C = C_init
        Yhat_new = tree_new.predict(xgb.DMatrix(X))

        increments_active = []
        increments_nofinetune = []
        finetune_inds = []
        num_collected_active = 0
        num_collected_nofinetune = 0

        for i in range(n):
            greedy = i % budget_window >= budget_window - greedy_steps

            # --- active sampling with fine-tuning ---
            prob = _sampling_prob(
                np.minimum(Yhat_new[i], 1 - Yhat_new[i]), C, budget,
                i, num_collected_active, greedy, tau,
            )
            xi = bernoulli.rvs(prob)
            if xi == 1:
                finetune_inds.append(i)
                num_collected_active += 1
            # Prediction plus inverse-probability-weighted correction.
            increments_active.append(Yhat_new[i] + (Y[i] - Yhat_new[i]) * xi / prob)

            # Once a full batch of labels is collected, continue training the
            # booster on it, refresh predictions, and re-estimate C from the
            # points seen so far.
            if len(finetune_inds) == batch_size:
                finetune_data = xgb.DMatrix(X[finetune_inds], label=Y[finetune_inds])
                tree_new = xgb.train(finetune_params, finetune_data, steps, xgb_model=tree_new)
                Yhat_new = tree_new.predict(xgb.DMatrix(X))
                Yhat_C = tree_new.predict(xgb.DMatrix(X[:(i + 1)]))
                C = np.mean(np.minimum(Yhat_C, 1 - Yhat_C))
                finetune_inds = []

            # --- active sampling without fine-tuning (initial model throughout) ---
            prob_nofinetune = _sampling_prob(
                np.minimum(Yhat[i], 1 - Yhat[i]), C_init, budget,
                i, num_collected_nofinetune, greedy, tau,
            )

            # couple sampling decisions to minimize variance in results:
            # xi_nofinetune has marginal probability prob_nofinetune but agrees
            # with xi whenever possible.
            if prob_nofinetune > prob:
                if xi == 1:
                    xi_nofinetune = 1
                else:
                    xi_nofinetune = bernoulli.rvs((prob_nofinetune - prob) / (1 - prob))
            else:
                if xi == 0:
                    xi_nofinetune = 0
                else:
                    xi_nofinetune = bernoulli.rvs(prob_nofinetune / prob)
            if xi_nofinetune == 1:
                num_collected_nofinetune += 1

            increments_nofinetune.append(Yhat[i] + (Y[i] - Yhat[i]) * xi_nofinetune / prob_nofinetune)

        # --- uniform sampling baseline: every point labeled with probability `budget` ---
        xi_unif = bernoulli.rvs(budget, size=n)
        increments_unif = Yhat + (Y - Yhat) * xi_unif / budget

        # Row order per trial is kept: active (w/ fine-tuning), uniform, active (no fine-tuning).
        results.append(_interval_record(increments_active, theta_true, alpha, "active (w/ fine-tuning)", n_b))
        results.append(_interval_record(increments_unif, theta_true, alpha, "uniform", n_b))
        results.append(_interval_record(increments_nofinetune, theta_true, alpha, "active (no fine-tuning)", n_b))

df = pd.DataFrame(results, columns=columns)

### Plot coverage and interval width

In [None]:
# Plot interval widths and coverage from the results frame using the project's
# `utils` helper. The second string argument looks like an output file name --
# presumably the figure is written there; verify in utils.
make_width_coverage_plot(
    df,
    "approval rate",
    "widths_and_coverage_pew79_biden_seq.pdf",
    theta_true,
    num_trials=num_trials,
    n_example_ind=3,
    finetuning=True,
    more_precision=True,
)

### Plot budget saved

In [None]:
# Plot budget savings from the same results frame using the project's `utils`
# helper. Arguments are, in order: the results frame, a title-like string and a
# PDF file name -- presumably the output path; verify in utils.
make_budget_plot(
    df,
    "Post-election survey research (Biden)",
    "budget_pew79_biden_seq.pdf",
    finetuning=True,
)