In [None]:
%load_ext autoreload
%autoreload 2
import os, sys
sys.path.insert(1, '../')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.special import expit
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import seaborn as sns
import pdb
from scipy.stats import norm
import matplotlib.patheffects as pe

## Read and preprocess data

In [None]:
# Load the raw table from disk
raw_df = pd.read_csv('./data.csv')

# Keep only the rows whose Year1 is not the 'tdo' marker, then make Year1 numeric
df = raw_df.loc[raw_df["Year1"] != 'tdo'].copy()
df["Year1"] = df["Year1"].astype(float)

# Label is 1.0 when at least one of the three (Type, Year) pairs has a
# non-missing Type and a Year inside [2000, 2015], and 0.0 otherwise
Y_all = np.logical_or.reduce([
    (df[f"Type{k}"].astype(str) != 'nan') & ((df[f"Year{k}"] >= 2000) & (df[f"Year{k}"] <= 2015))
    for k in (1, 2, 3)
]).astype(float)

# Two features per row: the 2015 and 2000 tree canopy cover columns, each divided by 100
X_all = np.stack(
    [df[name] / 100 for name in ("tree_canopy_cover_2015", "tree_canopy_cover_2000")],
    axis=1,
)

## Implementation of classical inference, PPI with data splitting, and cross-prediction

In [None]:
def trial(X, Y, n, alpha):
    """Run one simulation trial.

    Randomly splits (X, Y) into ``n`` labeled points and the remaining
    unlabeled points, then builds three intervals for the mean of Y from
    that split.

    Parameters
    ----------
    X, Y : features and labels of the full dataset.
    n : number of points placed in the labeled set.
    alpha : error level passed to each interval constructor.

    Returns
    -------
    tuple
        ``(classical_interval, pp_interval, cpp_interval)``.
    """
    # The labels of the unlabeled part are discarded on purpose.
    X_lab, X_unlab, Y_lab, _ = train_test_split(X, Y, train_size=n)

    # PPI with data splitting trains its model on 10% of the labeled points.
    n_train_ppi = int(0.1*n)

    intervals = (
        classical_mean_interval(Y_lab, alpha),                            # labeled data only
        pp_mean_interval(X_lab, X_unlab, Y_lab, alpha, n_train_ppi),      # PPI after data splitting
        cross_prediction_mean_interval(X_lab, X_unlab, Y_lab, alpha),     # cross-prediction
    )
    return intervals

In [None]:
def pp_mean_interval(X_labeled, X_unlabeled, Y_labeled, alpha, n_tr):
    """PPI interval for the mean of Y, using data splitting and a CLT width.

    ``n_tr`` labeled points are used to fit a gradient-boosting classifier;
    the remaining labeled points are used to correct the mean of the
    classifier's predictions on the unlabeled data.

    Parameters
    ----------
    X_labeled, Y_labeled : labeled features and labels.
    X_unlabeled : unlabeled features.
    alpha : error level; the half-width uses the ``1 - alpha/2`` normal quantile.
    n_tr : number of labeled points used for training.

    Returns
    -------
    list
        ``[lower, upper]``.
    """
    X_fit, X_rect, Y_fit, Y_rect = train_test_split(X_labeled, Y_labeled, train_size=n_tr)

    model = HistGradientBoostingClassifier(max_iter=100,max_depth=2)
    model.fit(X_fit, Y_fit)

    # Predicted probability of class 1 on the unlabeled and held-out labeled points
    pred_unlabeled = model.predict_proba(X_unlabeled)[:,1]
    pred_rect = model.predict_proba(X_rect)[:,1]
    residuals = Y_rect - pred_rect

    num_unlabeled = X_unlabeled.shape[0]
    num_rect = X_labeled.shape[0] - n_tr  # labeled points not used for training

    center = np.mean(pred_unlabeled) + np.mean(residuals)
    std_err = np.sqrt(np.var(pred_unlabeled)/num_unlabeled + np.var(residuals)/num_rect)
    margin = norm.ppf(1-alpha/2) * std_err

    return [center - margin, center + margin]

In [None]:
def cross_prediction_mean_interval(X_labeled, X_unlabeled, Y_labeled, alpha, K = 10):
    """Cross-prediction interval for the mean of Y.

    The labeled data are cut into ``K`` contiguous folds. For each fold a
    gradient-boosting classifier is fit on the other folds; its predictions
    on the unlabeled data are averaged over folds, and its predictions on the
    held-out fold are used to correct that average. The width comes from
    ``bootstrap_variance``.

    Parameters
    ----------
    X_labeled, Y_labeled : labeled features and labels.
    X_unlabeled : unlabeled features.
    alpha : error level; the half-width uses the ``1 - alpha/2`` normal quantile.
    K : number of folds.

    Returns
    -------
    list
        ``[lower, upper]``.

    Raises
    ------
    ValueError
        If there are fewer labeled points than folds.
    """
    n = X_labeled.shape[0]
    N = X_unlabeled.shape[0]

    if n < K:
        raise ValueError(f"need at least K={K} labeled points, got n={n}")

    # Nominal fold size; still used to size the bootstrap training sets below.
    fold_n = int(n/K)

    # Folds that cover EVERY labeled point. With fixed-size folds of int(n/K)
    # points, the last n - K*fold_n points never received a held-out
    # prediction (it stayed 0) yet still entered the correction term, which
    # biased the estimate whenever n was not a multiple of K. When K divides n
    # these folds are identical to the fixed-size ones.
    folds = np.array_split(np.arange(n), K)

    Yhat_labeled = np.zeros(n)
    Yhat_unlabeled = np.zeros(N)

    for val_ind in folds:
        train_mask = np.ones(n, dtype=bool)
        train_mask[val_ind] = False

        # use the out-of-fold data to train a classifier
        cls = HistGradientBoostingClassifier(max_iter=100,max_depth=2).fit(
            X_labeled[train_mask],
            Y_labeled[train_mask]
        )

        # average of the K models on the unlabeled data
        Yhat_unlabeled += cls.predict_proba(X_unlabeled)[:,1]/K
        # held-out predictions on the current fold
        Yhat_labeled[val_ind] = cls.predict_proba(X_labeled[val_ind])[:,1]

    thetaPP = np.mean(Yhat_unlabeled) + np.mean(Y_labeled - Yhat_labeled)

    var_hat = bootstrap_variance(X_labeled, X_unlabeled, Y_labeled, n-fold_n, thetaPP)

    halfwidth = norm.ppf(1-alpha/2) * np.sqrt(var_hat)

    return [thetaPP - halfwidth, thetaPP + halfwidth]

In [None]:
def bootstrap_variance(X_labeled, X_unlabeled, Y_labeled, train_n, thetaPP, B = 30):
    """Bootstrap estimate of the variance used by cross-prediction.

    For each of ``B`` rounds, ``train_n`` labeled indices are drawn with
    replacement, a gradient-boosting classifier is fit on them, and it is
    evaluated on the unlabeled data and on labeled points that were not drawn.

    Parameters
    ----------
    X_labeled, Y_labeled : labeled features and labels.
    X_unlabeled : unlabeled features.
    train_n : number of indices drawn for training in each round.
    thetaPP : accepted for interface compatibility; not used in the computation.
    B : number of bootstrap rounds.

    Returns
    -------
    float
        ``var(mean prediction on unlabeled)/N + var(prediction - label)/n``.
    """
    n = X_labeled.shape[0]
    N = X_unlabeled.shape[0]
    heldout_n = n - train_n

    avg_pred_unlabeled = np.zeros(N)
    # One row of prediction-minus-label differences per bootstrap round
    diffs = np.zeros((B, heldout_n))

    for b in range(B):
        drawn = np.random.choice(np.arange(n), train_n)

        model = HistGradientBoostingClassifier(max_iter=100,max_depth=2)
        model.fit(X_labeled[drawn,:], Y_labeled[drawn])

        avg_pred_unlabeled += model.predict_proba(X_unlabeled)[:,1]/B

        # Labeled indices that were not drawn, truncated to a fixed count so
        # every round contributes the same number of differences
        not_drawn = np.delete(np.arange(n), drawn)[:heldout_n]
        pred_heldout = model.predict_proba(X_labeled[not_drawn,:])[:,1]

        diffs[b] = pred_heldout - Y_labeled[not_drawn]

    unlabeled_term = np.var(avg_pred_unlabeled)/N
    labeled_term = np.var(diffs.ravel())/n

    return unlabeled_term + labeled_term

In [None]:
def classical_mean_interval(Y, alpha):
    """Classical CLT interval for the mean of ``Y``.

    Parameters
    ----------
    Y : array of observations.
    alpha : error level; the half-width uses the ``1 - alpha/2`` normal quantile.

    Returns
    -------
    list
        ``[lower, upper]``, centred on the sample mean.
    """
    center = np.mean(Y)
    # np.var with its default settings, divided by the sample size
    std_err = np.sqrt(np.var(Y)/len(Y))
    margin = norm.ppf(1-alpha/2) * std_err
    return [center - margin, center + margin]

## Construct confidence intervals

In [None]:
num_trials = 100
alpha = 0.1
ps = [0.1, 0.2, 0.3] # fraction of data with labels

# Fix the global NumPy seed so a full re-run reproduces the same intervals
# (the random splits and np.random.choice calls draw from this global state).
np.random.seed(0)

theta_true = np.mean(Y_all)

# columns of the stored results ("width" is derived below)
columns = ["lb","ub","coverage","estimator","n"]

filename = "./deforestation_results/simulation_results.csv"

# One record (dict) per interval; the frame is built once at the end so each
# column gets a clean dtype instead of writing strings into a float frame.
results = []

for p in ps:

    n = int(p*len(Y_all))

    for _ in tqdm(range(num_trials), desc=f"n={n}"):
        ci, ppi, cppi = trial(X_all, Y_all, n, alpha)

        for estimator, (lb, ub) in (("cross-prediction", cppi), ("classical", ci), ("PPI", ppi)):
            results.append({
                "lb": lb,
                "ub": ub,
                # 1.0 when the interval contains theta_true, else 0.0
                "coverage": float((lb <= theta_true) & (theta_true <= ub)),
                "estimator": estimator,
                "n": n,
            })

# A separate name is used so the preprocessed `df` from the data cell is not overwritten.
final_df = pd.DataFrame(results, columns=columns)
final_df["width"] = final_df["ub"] - final_df["lb"]

# save data
os.makedirs('./deforestation_results/', exist_ok=True)
final_df.to_csv(filename)

## Plot results

In [None]:
alpha=0.1

# Five Set2 colours, reordered; used as the seaborn palette
col = np.array([sns.color_palette("Set2")[idx] for idx in (1, 2, 0, 3, 4)])
sns.set_theme(font_scale=1.4, style='white', palette=col, rc={'lines.linewidth': 3})

fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(10,3.3))

# Left panel: coverage (no error bars). Right panel: width.
for ax, metric, extra in zip(axs, ('coverage', 'width'), ({'errorbar': None}, {})):
    sns.lineplot(ax=ax, data=final_df, x='n', y=metric, hue='estimator',
                 alpha=0.9, marker="*", markersize=14, **extra)

# Reference line at the nominal coverage level
axs[0].axhline(1-alpha, color="#888888", linestyle='dashed', zorder=1, alpha=0.9)

# Keep a single legend, on the width panel
handles, labels = axs[1].get_legend_handles_labels()
axs[1].legend(handles=handles, labels=labels)
axs[0].get_legend().remove()
axs[0].set_ylim([0.0,1])

# Dash the first three lines on each panel
for ax in axs:
    for j in range(3):
        ax.lines[j].set_linestyle("--")

sns.despine(top=True, right=True)
plt.tight_layout()

# save plot
plt.savefig('./deforestation_results/deforestation_comparison.pdf')
plt.show()

In [None]:
# Reload previously saved results: every file in the results directory whose
# name contains 'simulation_results.' is read and the frames are stacked.
datadir = './deforestation_results/'
filenames = os.listdir(datadir)
data = list(map(
    lambda fn: pd.read_csv(os.path.join(datadir, fn)),
    filter(lambda fn: 'simulation_results.' in fn, filenames),
))
final_df = pd.concat(data, axis=0, ignore_index=True)

In [None]:
num_ints = 5
# Labeled-sample size whose intervals are shown.
# NOTE(review): presumably one of the `n` values produced by the simulation; verify against final_df.n
n = 319

# Pick distinct trial positions: sampling with replacement could select the
# same trial twice and draw duplicate intervals in the figure.
inds = np.random.choice(num_trials, num_ints, replace=False)

# For each estimator, filter its rows once and take the same trial positions,
# giving a list of [lb, ub] pairs per estimator.
CPPI_ints, PPI_ints, classical_ints = (
    final_df[(final_df.estimator == name) & (final_df.n == n)]
    .iloc[inds][["lb", "ub"]]
    .values.tolist()
    for name in ("cross-prediction", "PPI", "classical")
)

## Intro figure

In [None]:
def lighten_color(color, amount=0.5):
    """
    Lightens the given color by multiplying (1-luminosity) by the given amount.
    Input can be matplotlib color string, hex string, or RGB tuple.

    Defined before the plotting code below so this cell runs on a fresh kernel.

    Parameters
    ----------
    color : matplotlib color name, hex string, or RGB sequence.
    amount : factor applied to (1 - lightness).

    Returns
    -------
    tuple
        The lightened color as an RGB tuple.

    Examples:
    >> lighten_color('g', 0.3)
    >> lighten_color('#F034A3', 0.6)
    >> lighten_color((.3,.55,.1), 0.5)
    """
    import matplotlib.colors as mc
    import colorsys
    try:
        c = mc.cnames[color]
    except (KeyError, TypeError):
        # KeyError: not a named color; TypeError: unhashable input such as an array
        c = color
    c = colorsys.rgb_to_hls(*mc.to_rgb(c))
    return colorsys.hls_to_rgb(c[0], 1 - amount * (1 - c[1]), c[2])


alpha=0.1

gap = 0.03
start1 = 0.5
start2 = 0.35
start3 = 0.2
linewidth_inner = 5
linewidth_outer = 7

k = len(CPPI_ints)

col = np.array([sns.color_palette("Set2")[1], sns.color_palette("Set2")[2],  sns.color_palette("Set2")[0]])
sns.set_theme(font_scale=1.4, style='white', palette=col, rc={'lines.linewidth': 3})

fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(15,3.3))

# Left panel: example intervals, with the true value as a dashed vertical line
axs[0].axvline(theta_true, color='gray', linestyle='dashed')

# (intervals, base height, colour) for each method, in drawing order.
# Each method uses ONE colour for both outline strokes; the PPI left stroke
# previously used the classical colour (col[1]) instead of col[2].
interval_groups = [
    (CPPI_ints, start1, col[0]),
    (PPI_ints, start3, col[2]),
    (classical_ints, start2, col[1]),
]

for i in reversed(range(k)):
    for ints, start, color in interval_groups:
        height = start + i*gap
        axs[0].plot(
            [ints[i][0], ints[i][1]],
            [height, height],
            linewidth=linewidth_inner,
            color=lighten_color(color, 0.6),
            # two offset strokes in the full colour outline the lighter bar
            path_effects=[
                pe.Stroke(linewidth=linewidth_outer, offset=(-1,0), foreground=color),
                pe.Stroke(linewidth=linewidth_outer, offset=(1,0), foreground=color),
                pe.Normal(),
            ],
            solid_capstyle='butt',
        )

axs[0].set_xlabel('fraction of deforested areas', fontsize=16)
axs[0].set_yticks([])

# Middle panel: coverage (no error bars). Right panel: width.
sns.lineplot(ax=axs[1],data=final_df, x='n', y='coverage', hue='estimator', alpha=0.9, errorbar=None, marker="*", markersize=14)
sns.lineplot(ax=axs[2],data=final_df, x='n', y='width', hue='estimator', alpha=0.9, marker="*", markersize=14)

# Reference line at the nominal coverage level
axs[1].axhline(1-alpha, color="#888888", linestyle='dashed',  alpha=0.8)

# Keep a single legend, on the width panel
handles, labels = axs[1].get_legend_handles_labels()
axs[2].legend(handles=handles, labels=labels)
axs[1].get_legend().remove()
axs[1].set_ylim([0.6,1])

# Dash the first three lines on the coverage and width panels
for i in [1,2]:
    for j in range(3):
        axs[i].lines[j].set_linestyle("--")

sns.despine(top=True, right=True)
plt.tight_layout()

# save plot
plt.savefig('./deforestation_results/deforestation_intro.pdf')
plt.show()