In [None]:
%load_ext autoreload
%autoreload 2
import os, sys
sys.path.insert(1, '../')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.special import expit
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import seaborn as sns
import warnings
import pdb
from scipy.stats import norm

## Read and preprocess data

In [None]:
# Load the raw table from disk
raw_df = pd.read_csv('./data.csv')

# Keep only rows whose Year1 entry is not the string 'tdo', then make Year1 numeric
keep_rows = raw_df["Year1"] != 'tdo'
df = raw_df.loc[keep_rows].copy()
df["Year1"] = df["Year1"].astype(float)

# One boolean Series per (TypeK, YearK) pair: the type is not missing
# and the year lies in [2000, 2015]
event_in_window = []
for k in (1, 2, 3):
    type_present = df[f"Type{k}"].astype(str) != 'nan'
    year_in_range = (df[f"Year{k}"] >= 2000) & (df[f"Year{k}"] <= 2015)
    event_in_window.append(type_present & year_in_range)

# Label is 1.0 when at least one of the three pairs qualifies, 0.0 otherwise
Y_all = (event_in_window[0] | event_in_window[1] | event_in_window[2]).astype(float).to_numpy()

# Two features per row: the 2015 and 2000 canopy-cover columns, each divided by 100
canopy_2015 = df["tree_canopy_cover_2015"] / 100
canopy_2000 = df["tree_canopy_cover_2000"] / 100
X_all = np.stack([canopy_2015, canopy_2000], axis=1)

## Implementation of the heuristic baselines: fold-averaged predictions without debiasing, and PPI-style debiasing without folds

In [None]:
def trial(X, Y, n, alpha, random_state=None):
    """Run one trial: randomly split the data and compute both heuristic intervals.

    Parameters
    ----------
    X, Y : features and labels, passed unchanged to ``train_test_split``.
    n : passed as ``train_size``; that part of the split is treated as the
        labeled data, the remainder as unlabeled data.
    alpha : forwarded to both interval constructors.
    random_state : optional, forwarded to ``train_test_split`` so a single
        split can be reproduced. The default ``None`` keeps the previous
        behaviour of the function.

    Returns
    -------
    tuple
        ``(no_debiasing_int, no_fold_int)``, the return values of
        ``no_debiasing_interval`` and ``no_fold_interval`` respectively.
    """
    # The labels of the unlabeled part are never used by either heuristic,
    # so they are discarded right away.
    X_labeled, X_unlabeled, Y_labeled, _ = train_test_split(
        X, Y, train_size=n, random_state=random_state
    )

    # uses folds but applies no debiasing
    no_debiasing_int = no_debiasing_interval(X_labeled, X_unlabeled, Y_labeled, alpha)

    # applies the debiasing term but without folds
    no_fold_int = no_fold_interval(X_labeled, X_unlabeled, Y_labeled, alpha)

    return no_debiasing_int, no_fold_int

In [None]:
def no_debiasing_interval(X_labeled, X_unlabeled, Y_labeled, alpha, K = 10):
    """Interval from fold-averaged predictions alone, with no debiasing term.

    K classifiers are fitted, each on the labeled rows with one contiguous
    block of ``int(n/K)`` rows left out. Their second-column ``predict_proba``
    outputs on the unlabeled rows are averaged, and a normal-approximation
    interval for the mean of those averaged predictions is returned. The
    labeled outcomes are used only for fitting, never to correct the estimate.

    When the number of labeled rows is not a multiple of K, the trailing
    rows are never left out and so appear in every training set.

    Returns ``[lower, upper]``.
    """
    num_labeled = X_labeled.shape[0]
    num_unlabeled = X_unlabeled.shape[0]
    block = int(num_labeled / K)

    avg_pred = np.zeros(num_unlabeled)

    for fold in range(K):
        # mask out the contiguous block of rows belonging to this fold
        in_train = np.ones(num_labeled, dtype=bool)
        in_train[fold * block:(fold + 1) * block] = False

        # fit on everything outside the current fold
        model = HistGradientBoostingClassifier(max_iter=100, max_depth=2).fit(
            X_labeled[in_train, :],
            Y_labeled[in_train]
        )

        # each fold contributes 1/K of the final prediction
        avg_pred += model.predict_proba(X_unlabeled)[:, 1] / K

    center = np.mean(avg_pred)
    z = norm.ppf(1 - alpha / 2)
    margin = z * np.std(avg_pred) / np.sqrt(num_unlabeled)

    return [center - margin, center + margin]

In [None]:
def no_fold_interval(X_labeled, X_unlabeled, Y_labeled, alpha):
    """Debiased interval computed without folds.

    A single classifier is fitted on all labeled rows. The point estimate is
    the mean second-column ``predict_proba`` output on the unlabeled rows
    minus the mean residual (prediction minus label) on the same labeled
    rows the classifier was fitted on. The standard error combines the
    variance of the unlabeled predictions and of those residuals.

    Returns ``[lower, upper]``.
    """
    num_labeled = X_labeled.shape[0]
    num_unlabeled = X_unlabeled.shape[0]

    model = HistGradientBoostingClassifier(max_iter=100, max_depth=5).fit(
        X_labeled,
        Y_labeled
    )

    pred_unlabeled = model.predict_proba(X_unlabeled)[:, 1]
    pred_labeled = model.predict_proba(X_labeled)[:, 1]

    # prediction error on the rows used for fitting
    residuals = pred_labeled - Y_labeled

    center = np.mean(pred_unlabeled) - np.mean(residuals)

    std_err = np.sqrt(
        np.var(pred_unlabeled) / num_unlabeled + np.var(residuals) / num_labeled
    )
    margin = norm.ppf(1 - alpha / 2) * std_err

    return [center - margin, center + margin]

## Construct confidence intervals

In [None]:
# this is a mean estimation problem

num_trials = 10
alpha = 0.1
ps = [0.1, 0.2, 0.3] # fraction of data with labels
seed = 0 # makes the random splits below repeatable across runs

theta_true = np.mean(Y_all) # we treat empirical mean over whole data set as ground truth

# store results
columns = ["lb","ub","coverage","estimator","n"]

filename = "./deforestation_results/simulation_results_heuristic.csv"

# train_test_split (and any estimator left at random_state=None) draws from
# NumPy's global RNG, so seeding it once here fixes the whole experiment.
np.random.seed(seed)

# One plain record per (trial, estimator). Building the DataFrame once from
# records gives every column its natural dtype, instead of assigning strings
# and booleans into a frame that was pre-filled with float zeros.
results = []

for p in ps:
    n = int(p*len(Y_all))

    for i in range(num_trials):
        ci_nodebiasing, ci_nofold = trial(X_all, Y_all, n, alpha)

        for estimator, (lb, ub) in (("no debiasing", ci_nodebiasing), ("no folds", ci_nofold)):
            results.append({
                "lb": lb,
                "ub": ub,
                # stored as 1.0 / 0.0 so that averaging it gives the coverage rate
                "coverage": float(lb <= theta_true <= ub),
                "estimator": estimator,
                "n": n,
            })

# Deliberately not assigned to `df`: that name holds the preprocessed input
# data and overwriting it would make the notebook depend on cell order.
final_df = pd.DataFrame(results, columns=columns)
final_df["width"] = final_df["ub"] - final_df["lb"]

# save data
os.makedirs(os.path.dirname(filename), exist_ok=True)
final_df.to_csv(filename)

## Plot results

In [None]:
alpha=0.1

# reorder the first three "Set2" colours and install them as the default palette
set2 = sns.color_palette("Set2")
col = np.array([set2[1], set2[2], set2[0]])
sns.set_theme(font_scale=1.4, style='white', palette=col, rc={'lines.linewidth': 3})

# left panel: coverage against n, right panel: interval width against n
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(10,3.3))
sns.lineplot(
    ax=axs[0], data=final_df, x='n', y='coverage', hue='estimator',
    alpha=0.9, errorbar=None, marker="*", markersize=14,
)
sns.lineplot(
    ax=axs[1], data=final_df, x='n', y='width', hue='estimator',
    alpha=0.9, marker="*", markersize=14,
)

# dashed horizontal reference at the nominal level 1 - alpha
axs[0].axhline(1-alpha, color="#888888", linestyle='dashed', zorder=1, alpha=0.9)

# keep a single legend, on the right panel only
handles, labels = axs[1].get_legend_handles_labels()
axs[1].legend(handles=handles, labels=labels)
axs[0].get_legend().remove()
axs[0].set_ylim([0.0,1])

# draw the first two lines of each panel dashed
for ax in (axs[0], axs[1]):
    for j in (0, 1):
        ax.lines[j].set_linestyle("--")

sns.despine(top=True, right=True)
plt.tight_layout()

# save plot
plt.savefig('./deforestation_results/heuristics_comparison.pdf')
plt.show()

In [None]:
# Reload previously saved results: every file in the results directory whose
# name contains 'heuristic.csv' is read and the frames are stacked row-wise.
datadir = './deforestation_results/'
filenames = os.listdir(datadir)

data = []
for fn in filenames:
    if 'heuristic.csv' in fn:
        data.append(pd.read_csv(os.path.join(datadir, fn)))

final_df = pd.concat(data, axis=0, ignore_index=True)