In [14]:
import os
import math
import random
import itertools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.rcParams.update({'font.size': 11})
import matplotlib.ticker as ticker
# PyTorch
import torch
# GPyTorch
from gpytorch.mlls import ExactMarginalLogLikelihood
# BOTorch
import botorch
from botorch.models import SingleTaskGP
from botorch.models.transforms import Normalize, Standardize
from botorch.fit import fit_gpytorch_mll
from botorch.acquisition import LogExpectedImprovement
from botorch.optim import optimize_acqf

In [15]:
################################################################################
def check_epochs(df, n, batch_size=128, steps=6000, drop_last=True):
    num_batches = math.floor(n/batch_size) if drop_last else math.ceil(n/batch_size)
    epochs = int(steps/num_batches)
    return df.shape[0] == epochs

def print_job(alpha, beta, dataset, dataset_dir, experiments_dir, lr_0, 
              method, model, model_arch, n, prior_dir, prior_type, random_state, 
              save, tune):
    model_name = f"{model}_alpha={alpha}_beta={beta}_lr_0={lr_0}_n={n}_random_state={random_state}"
    if os.path.exists(f"{experiments_dir}/{model_name}.csv"):
        temp_df = pd.read_csv(f"{experiments_dir}/{model_name}.csv")
        n_train = n - int((1/5) * n) if tune else n
        if check_epochs(temp_df, n_train, batch_size=min(32, n_train), steps=12000, drop_last=False):
            return

    command = (
        f"python ../src/main_text_classifiers.py "
        f"--alpha={alpha} "
        "--batch_size=32 "
        f"--beta={beta} "
        f"--dataset=\"{dataset}\" "
        f"--dataset_dir=\"{dataset_dir}\" "
        f"--experiments_dir=\"{experiments_dir}\" "
        f"--lr_0={lr_0} "
        f"--method=\"{method}\" "
        f"--model=\"{model}\" "
        f"--model_arch=\"{model_arch}\" "
        f"--model_name=\"{model_name}\" "
        f"--n={n} "
        "--num_workers=0 "
        f"--prior_dir=\"{prior_dir}\" "
        f"--prior_type=\"{prior_type}\" "
        f"--random_state={random_state} "
        f"{'--save' if save else ''}"
        f"{'--tune' if tune else ''}"
    )
    
    print(f"    '{command}'")
    
def get_runtime(alpha, beta, experiments_dir, lr_0, model, n, random_state, tune):
    model_name = f"{model}_alpha={alpha}_beta={beta}_lr_0={lr_0}_n={n}_random_state={random_state}"
    if not os.path.exists(f"{experiments_dir}/{model_name}.csv"):
        return 0.0
        #raise FileNotFoundError(f"Expected file not found: {experiments_dir}/{model_name}.csv")
    df = pd.read_csv(f"{experiments_dir}/{model_name}.csv")
    n_train = n - int((1/5) * n) if tune else n
    if not check_epochs(df, n_train, batch_size=min(32, n_train), steps=12000, drop_last=False):
        return 0.0
        #raise RuntimeError(f"Run incomplete: {model_name} did not run for the specified number of epochs")
    return df["train_sec/epoch"].sum()

def get_val_or_test_acc(alpha, beta, experiments_dir, lr_0, model, n, random_state, tune):
    model_name = f"{model}_alpha={alpha}_beta={beta}_lr_0={lr_0}_n={n}_random_state={random_state}"
    if not os.path.exists(f"{experiments_dir}/{model_name}.csv"):
        return 0.0
        #raise FileNotFoundError(f"Expected file not found: {experiments_dir}/{model_name}.csv")
    df = pd.read_csv(f"{experiments_dir}/{model_name}.csv")
    n_train = n - int((1/5) * n) if tune else n
    if not check_epochs(df, n_train, batch_size=min(32, n_train), steps=12000, drop_last=False):
        return 0.0
        #raise RuntimeError(f"Run incomplete: {model_name} did not run for the specified number of epochs")
    return df["val_or_test_acc"].values[-1]
    
def get_val_or_test_nll(alpha, beta, experiments_dir, lr_0, model, n, random_state, tune):
    model_name = f"{model}_alpha={alpha}_beta={beta}_lr_0={lr_0}_n={n}_random_state={random_state}"
    if not os.path.exists(f"{experiments_dir}/{model_name}.csv"):
        return float("inf")
        #raise FileNotFoundError(f"Expected file not found: {experiments_dir}/{model_name}.csv")
    df = pd.read_csv(f"{experiments_dir}/{model_name}.csv")
    n_train = n - int((1/5) * n) if tune else n
    if not check_epochs(df, n_train, batch_size=min(32, n_train), steps=12000, drop_last=False):
        return float("inf")
        #raise RuntimeError(f"Run incomplete: {model_name} did not run for the specified number of epochs")
    return df["val_or_test_nll"].values[-1]

def get_candidate(train_X, train_Y, seed):
    
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    
    gp = SingleTaskGP(
        train_X=train_X,
        train_Y=train_Y,
        input_transform=Normalize(d=3),
        outcome_transform=Standardize(m=1),
    )
    mll = ExactMarginalLogLikelihood(gp.likelihood, gp)
    fit_gpytorch_mll(mll)

    logEI = LogExpectedImprovement(model=gp, best_f=train_Y.max())

    bounds = torch.stack([torch.zeros(3), torch.ones(3)]).to(torch.double)
    candidate, acq_value = optimize_acqf(
      logEI, bounds=bounds, q=1, num_restarts=5, raw_samples=20,
    )
    
    return candidate.detach()

In [16]:
# In queue:

# News-4 n_iters = 12 tuned

In [17]:
# DONE:

# News-4 n_iters = 11 tuned

# News-4 n_iters = 11 retrained

# TODO:

# News-4 n_iters = 12 tuned

# News-4 n_iters = 12 retrained

In [18]:
dataset = "News-4"
dataset_dir = "/cluster/tufts/hugheslab/eharve06/News-4"
model = "l2-sp"
method = "MAP"
model_arch = "BERT-base"
ns = [40000, 120000]
prior_dir = ""
prior_type = ""
random_states = [1001, 2001, 3001]
retrained_experiments_dir = "/cluster/tufts/hugheslab/eharve06/data-emphasized-ELBo/experiments/retrained_News-4_BO"
tuned_experiments_dir = "/cluster/tufts/hugheslab/eharve06/data-emphasized-ELBo/experiments/tuned_News-4_BO"

n_iters = 11
seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

bounds = torch.tensor([[-6, -6, -4], [-2, -2, -1]], dtype=torch.double)

columns = ["alpha", "beta", "lr_0", "n", "n_iter", "random_state", "runtime", "seed", "val_acc", "val_nll", "test_acc", "test_nll"]
news4_bo_df = pd.DataFrame(columns=columns)

for n, random_state, seed in itertools.product(ns, random_states, seeds):

    gen = torch.Generator()
    gen.manual_seed(seed)

    train_X = torch.rand(size=(1, 3,), generator=gen, dtype=torch.double)
    train_X_bounded = (bounds[1] - bounds[0]) * train_X + bounds[0]
    
    alpha, beta, lr_0 = 10**train_X_bounded[0]
    print_job(alpha, beta, dataset, dataset_dir, tuned_experiments_dir, lr_0, method, model, model_arch, n, prior_dir, prior_type, random_state, False, True)

    train_Y = torch.tensor([[-get_val_or_test_nll(10**x[0], 10**x[1], tuned_experiments_dir, 10**x[2], model, n, random_state, True)] for x in train_X_bounded], dtype=torch.float64)
    
    alpha_star, beta_star, lr_0_star = 10**train_X_bounded[torch.argmax(train_Y)]
    print_job(alpha_star, beta_star, dataset, dataset_dir, retrained_experiments_dir, lr_0_star, method, model, model_arch, n, prior_dir, prior_type, random_state, True, False)
    
    runtime = get_runtime(alpha, beta, tuned_experiments_dir, lr_0, model, n, random_state, True)
    runtime += get_runtime(alpha_star, beta_star, retrained_experiments_dir, lr_0_star, model, n, random_state, False)
    
    val_acc = get_val_or_test_acc(alpha, beta, tuned_experiments_dir, lr_0, model, n, random_state, True)
    val_nll = get_val_or_test_nll(alpha, beta, tuned_experiments_dir, lr_0, model, n, random_state, True)
    test_acc = get_val_or_test_acc(alpha_star, beta_star, retrained_experiments_dir, lr_0_star, model, n, random_state, False)
    test_nll = get_val_or_test_nll(alpha_star, beta_star, retrained_experiments_dir, lr_0_star, model, n, random_state, False)

    row = [alpha.item(), beta.item(), lr_0.item(), n, 0, random_state, runtime, seed, val_acc, val_nll, test_acc, test_nll]
    news4_bo_df.loc[len(news4_bo_df)] = row
    
    for i in range(1, n_iters+1):

        candidate = get_candidate(train_X, train_Y, seed)
        candidate_bounded = (bounds[1] - bounds[0]) * candidate + bounds[0]
        train_X = torch.cat([train_X, candidate])
        train_X_bounded = (bounds[1] - bounds[0]) * train_X + bounds[0]
                
        alpha, beta, lr_0 = 10**candidate_bounded[0]        
        print_job(alpha, beta, dataset, dataset_dir, tuned_experiments_dir, lr_0, method, model, model_arch, n, prior_dir, prior_type, random_state, False, True)

        train_Y = torch.tensor([[-get_val_or_test_nll(10**x[0], 10**x[1], tuned_experiments_dir, 10**x[2], model, n, random_state, True)] for x in train_X_bounded], dtype=torch.float64)

        alpha_star, beta_star, lr_0_star = 10**train_X_bounded[torch.argmax(train_Y)]
        #print_job(alpha_star, beta_star, dataset, dataset_dir, retrained_experiments_dir, lr_0_star, method, model, model_arch, n, prior_dir, prior_type, random_state, True, False)
        
        runtime = sum([get_runtime(10**x[0], 10**x[1], tuned_experiments_dir, 10**x[2], model, n, random_state, True) for x in train_X_bounded])
        runtime += get_runtime(alpha_star, beta_star, retrained_experiments_dir, lr_0_star, model, n, random_state, False)
        
        val_acc = get_val_or_test_acc(alpha, beta, tuned_experiments_dir, lr_0, model, n, random_state, True)
        val_nll = get_val_or_test_nll(alpha, beta, tuned_experiments_dir, lr_0, model, n, random_state, True)
        test_acc = get_val_or_test_acc(alpha_star, beta_star, retrained_experiments_dir, lr_0_star, model, n, random_state, False)
        test_nll = get_val_or_test_nll(alpha_star, beta_star, retrained_experiments_dir, lr_0_star, model, n, random_state, False)

        row = [alpha.item(), beta.item(), lr_0.item(), n, i, random_state, runtime, seed, val_acc, val_nll, test_acc, test_nll]
        news4_bo_df.loc[len(news4_bo_df)] = row

news4_bo_df.head(100)

Unnamed: 0,alpha,beta,lr_0,n,n_iter,random_state,runtime,seed,val_acc,val_nll,test_acc,test_nll
0,0.007589,0.000678,0.002389,40000.0,0.0,1001.0,14682.265820,0.0,0.934875,0.020004,0.932500,0.020449
1,0.000001,0.000001,0.100000,40000.0,1.0,1001.0,22301.092462,0.0,0.250000,0.129966,0.932500,0.020449
2,0.004011,0.001500,0.000631,40000.0,2.0,1001.0,29777.094771,0.0,0.927000,0.019971,0.922368,0.021701
3,0.002254,0.010000,0.078326,40000.0,3.0,1001.0,37304.995549,0.0,0.911625,0.025204,0.922368,0.021701
4,0.004851,0.000001,0.100000,40000.0,4.0,1001.0,44933.403602,0.0,0.916750,0.025521,0.922368,0.021701
...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.000001,0.000001,0.001470,40000.0,11.0,1001.0,98403.808607,7.0,0.932750,0.019107,0.930000,0.020371
96,0.001360,0.000193,0.000214,40000.0,0.0,1001.0,14907.553491,8.0,0.912125,0.023456,0.908684,0.024769
97,0.000001,0.000001,0.100000,40000.0,1.0,1001.0,22467.267115,8.0,0.250000,0.129970,0.908684,0.024769
98,0.002509,0.001321,0.000134,40000.0,2.0,1001.0,30277.997782,8.0,0.905750,0.025177,0.908684,0.024769


In [19]:
#news4_bo_df.to_csv("News-4_BERT-base_BO.csv", index=False)
news4_bo_df = pd.read_csv("News-4_BERT-base_BO.csv")
news4_bo_df.head(100)

Unnamed: 0,alpha,beta,lr_0,n,n_iter,random_state,runtime,seed,val_acc,val_nll,test_acc,test_nll
0,0.007589,0.000678,0.002389,40000.0,0.0,1001.0,14682.265820,0.0,0.934875,0.020004,0.932500,0.020449
1,0.000001,0.000001,0.100000,40000.0,1.0,1001.0,22301.092462,0.0,0.250000,0.129966,0.932500,0.020449
2,0.004011,0.001500,0.000631,40000.0,2.0,1001.0,29777.094771,0.0,0.927000,0.019971,0.922368,0.021701
3,0.002254,0.010000,0.078326,40000.0,3.0,1001.0,37304.995549,0.0,0.911625,0.025204,0.922368,0.021701
4,0.004851,0.000001,0.100000,40000.0,4.0,1001.0,44933.403602,0.0,0.916750,0.025521,0.922368,0.021701
...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.000001,0.000001,0.001470,40000.0,11.0,1001.0,98403.808607,7.0,0.932750,0.019107,0.930000,0.020371
96,0.001360,0.000193,0.000214,40000.0,0.0,1001.0,14907.553491,8.0,0.912125,0.023456,0.908684,0.024769
97,0.000001,0.000001,0.100000,40000.0,1.0,1001.0,22467.267115,8.0,0.250000,0.129970,0.908684,0.024769
98,0.002509,0.001321,0.000134,40000.0,2.0,1001.0,30277.997782,8.0,0.905750,0.025177,0.908684,0.024769
