### Import necessary packages

In [None]:
%load_ext autoreload
%autoreload 2
import pyreadstat
import numpy as np
from sklearn.model_selection import train_test_split
import xgboost as xgb
import scipy
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os, sys
from scipy.stats import norm, bernoulli
from ppi_py.datasets import load_dataset
import matplotlib.patheffects as pe
from utils import make_width_coverage_plot, make_budget_plot
import warnings; warnings.simplefilter('ignore')

### Import the AlphaFold data set

Load the data. The data set contains true indicators of disorder (```Y```), predictions of disorder (```Yhat```), and indicators of a post-translational modification (PTM): ```phosphorylated```, ```ubiquitinated```, or ```acetylated```. Predictions of disorder are made based on AlphaFold predictions of structure. This notebook uses the ```phosphorylated``` indicator to define the two groups.

In [None]:
# Load the AlphaFold data set through ppi_py's loader and pull out the
# fields used in the rest of the notebook.
dataset_folder = "./alphafold/data/"
data = load_dataset(dataset_folder, "alphafold")

# True disorder labels and their predictions
Y_total, Yhat_total = data["Y"], data["Yhat"]

# Boolean group indicator (True where the sample is phosphorylated)
Z = data["phosphorylated"].astype(bool)

### Problem setup

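Compute the ground-truth value of the odds ratio, i.e. the odds of disorder in the phosphorylated group divided by the odds of disorder in the non-phosphorylated group. Specify the range of budgets in fractional form $n_b/n$, the error level $\alpha$, and the number of trials.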

In [None]:
# Partition labels and predictions by the group indicator:
# group 1 is where Z is True, group 0 is everything else.
Y0, Yhat0 = Y_total[~Z], Yhat_total[~Z]
Y1, Yhat1 = Y_total[Z], Yhat_total[Z]

# Group sizes and total sample size
n0, n1 = Y0.shape[0], Y1.shape[0]
n = len(Y_total)

# Ground-truth odds ratio computed from all labels:
# odds of group 1 divided by odds of group 0.
mu0, mu1 = np.mean(Y0), np.mean(Y1)
odds_ratio = (mu1 / (1 - mu1)) / (mu0 / (1 - mu0))

In [None]:
# Experiment configuration
num_trials = 1000                          # repetitions per budget
alpha = 0.1                                # error level of the intervals
budgets = np.linspace(0.01, 0.2, num=20)   # budgets as fractions n_b/n

### Odds ratio confidence interval

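The sampling probabilities ```probs0```/```probs1``` together with the prediction weights ```lhat0```/```lhat1``` determine the method: active uses uncertainty-based probabilities with ```lhat = 1```, uniform uses constant probabilities with ```lhat = 1```, and classical uses constant probabilities with ```lhat = 0```.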

In [None]:
def odds_ratio_ci(Y0, Yhat0, Y1, Yhat1, probs0, probs1, alpha, budget, lhat0=None, lhat1=None):
    """Confidence interval for the odds ratio between two groups.

    Labels in each group are observed at random: sample ``i`` is collected
    with a Bernoulli draw whose probability is an equal-weight mix of the
    supplied ``probs`` and the constant ``budget``. Each group mean is
    estimated with the inverse-probability-corrected terms
    ``lhat*Yhat + (Y - lhat*Yhat)*xi/probs``. The interval is built on the
    log-odds-ratio scale with a normal approximation (delta method for the
    logit) and exponentiated before being returned.

    Parameters
    ----------
    Y0, Y1 : array
        Labels for group 0 and group 1.
    Yhat0, Yhat1 : array
        Predictions for group 0 and group 1, aligned with ``Y0`` / ``Y1``.
    probs0, probs1 : array
        Requested per-sample labelling probabilities for each group,
        before mixing with ``budget``.
    alpha : float
        Error level; the interval uses the ``1 - alpha/2`` normal quantile.
    budget : float
        Constant sampling probability that is mixed into ``probs0`` and
        ``probs1`` with weight 0.5.
    lhat0, lhat1 : float
        Weight placed on the predictions in each group (0 ignores them,
        1 uses them at full strength). Must be supplied; ``None`` is rejected.

    Returns
    -------
    tuple
        ``(lower, upper)`` bounds for the odds ratio.

    Raises
    ------
    TypeError
        If ``lhat0`` or ``lhat1`` is ``None``.

    Notes
    -----
    The result is random (it draws from ``scipy.stats.bernoulli``). If an
    estimated group mean is not strictly inside (0, 1), the log-odds are
    not finite and neither are the returned bounds.
    """
    if lhat0 is None or lhat1 is None:
        # Fail before any random draws instead of on an opaque `None * array`.
        raise TypeError(
            "lhat0 and lhat1 must be numbers "
            "(e.g. 1 to use the predictions, 0 to ignore them), not None"
        )

    tau = 0.5  # weight of the uniform rule in the sampling mixture

    def _estimator_terms(Y, Yhat, probs, lhat):
        # Per-sample terms whose mean estimates the group mean.
        probs = np.clip((1 - tau) * probs + tau * budget, 0, 1)
        xi = bernoulli.rvs(probs)  # 1 where the label is collected
        return lhat * Yhat + (Y - lhat * Yhat) * xi / probs

    # Group 0 is drawn before group 1, so the order of random draws is fixed.
    terms0 = _estimator_terms(Y0, Yhat0, probs0, lhat0)
    terms1 = _estimator_terms(Y1, Yhat1, probs1, lhat1)

    mu0_hat = np.mean(terms0)
    mu1_hat = np.mean(terms1)
    pointest_log = np.log(mu1_hat/(1-mu1_hat)) - np.log(mu0_hat/(1-mu0_hat))

    # Variance of each logit: divide by the squared derivative factor mu*(1-mu).
    var0 = np.var(terms0)/((mu0_hat*(1-mu0_hat))**2)
    var1 = np.var(terms1)/((mu1_hat*(1-mu1_hat))**2)

    n0 = Y0.shape[0]
    n1 = Y1.shape[0]
    p0 = n0/(n0+n1)
    p1 = n1/(n0+n1)
    # Equivalent to var0/n0 + var1/n1 once divided by (n0+n1) below.
    var = 1/p0*var0 + 1/p1*var1
    width_log = norm.ppf(1-alpha/2)*np.sqrt(var/(n0+n1))
    return np.exp(pointest_log - width_log), np.exp(pointest_log + width_log)

### Main experiment

Forms dataframe ```df``` with experiment results. The columns in the dataframe are:

- ```lb``` - interval lower bound

- ```ub``` - interval upper bound

- ```interval width``` - equal to ```ub``` - ```lb```

- ```coverage``` - 0/1 indicator of whether or not interval covered target

- ```estimator``` - one of ```classical```, ```uniform```, or ```active```

- ```$n_b$``` - budget size (number of labels), computed as ```int(budget*n)```

In [None]:
# Run every estimator `num_trials` times at each budget and collect one
# record per (budget, trial, estimator). Records are plain tuples and the
# dataframe is built once at the end, so each column gets a single dtype.
results = []
columns = ["lb", "ub", "interval width", "coverage", "estimator", "$n_b$"]

# Prediction uncertainty depends on neither the budget nor the trial.
uncertainty0 = np.minimum(Yhat0, 1-Yhat0)
uncertainty1 = np.minimum(Yhat1, 1-Yhat1)

for budget in budgets:
    # Active sampling: probabilities proportional to the uncertainty,
    # scaled so that their mean equals the budget.
    eta0 = budget / np.mean(uncertainty0)
    probs0 = eta0*uncertainty0
    eta1 = budget / np.mean(uncertainty1)
    probs1 = eta1*uncertainty1

    # Uniform sampling: every sample is labelled with probability `budget`.
    unif0 = budget*np.ones(n0)
    unif1 = budget*np.ones(n1)

    n_b = int(budget*n)

    # (estimator name, group-0 probabilities, group-1 probabilities, lhat)
    settings = [
        ("active", probs0, probs1, 1),
        ("uniform", unif0, unif1, 1),
        ("classical", unif0, unif1, 0),
    ]

    for _ in range(num_trials):
        for estimator, sampling0, sampling1, lhat in settings:
            l, u = odds_ratio_ci(Y0, Yhat0, Y1, Yhat1, sampling0, sampling1, alpha, budget, lhat0=lhat, lhat1=lhat)
            covered = (odds_ratio >= l)*(odds_ratio <= u)
            results.append((l, u, u-l, covered, estimator, n_b))

df = pd.DataFrame(results, columns=columns)

### Plot coverage and interval width

In [None]:
# Width/coverage figure for the odds ratio; the helper comes from the local
# `utils` module (presumably it writes the figure to the given PDF name).
make_width_coverage_plot(
    df,
    "odds ratio",
    "widths_and_coverage_alphafold.pdf",
    odds_ratio,
    n_example_ind=3,
)

In [None]:
# Budget figure for the AlphaFold experiment; the helper comes from the local
# `utils` module (presumably it writes the figure to the given PDF name).
make_budget_plot(
    df,
    "AlphaFold",
    "budget_alphafold.pdf",
)