In [1]:
import json
from eval import accuracy, ece

### Evaluate the performance of vPGM on ScienceQA test set

In [3]:
# evaluate vPGM
log_path = '../results/my_scienceqa/test_2563/vpgm-n3_Meta-Llama-3-8B-Instruct_run-0.json'
gt_answers = []
lm_answers = []
lm_probabilities = []
for log in json.load(open(log_path)):
    pid = log['pid']
    gt_answers.append(log['gt_answer'])
    lm_answer_dict = log['lm_answer']
    # pick the answer with the highest probability in the lm_answer_dict
    lm_answer = max(lm_answer_dict, key=lm_answer_dict.get)
    lm_answers.append(lm_answer)
    lm_probabilities.append(lm_answer_dict[lm_answer])
print(f'Accuracy: {accuracy(gt_answers, lm_answers)}')
print(f'ECE: {ece(gt_answers, lm_answers, lm_probabilities)}')

Accuracy: 0.8638314475224347
ECE: 0.016665039668960494


### Train/Evaluate BayesVPGM on the ScienceQA validation/test set

In [4]:
# code for BayesVPGM
import sys
sys.path.append('../')
from engine.bayesvpgm import pi_k_model, ClassficationAndMDCA
import torch
import numpy as np
import json

vpgm_logs_path = '../results/my_scienceqa/test_2563/vpgm-n3_Meta-Llama-3-8B-Instruct_run-0.json'
train = False
with open(vpgm_logs_path) as f:
    vpgm_logs = json.load(f)
max_num_category = 5  # 5 for ScienceQA
    
# Create tensor dataset
pis = []
tags = []
gt_answers = []
lm_answers = []
lm_probability = []
nks = torch.zeros(max_num_category)
for log in vpgm_logs:
    prob_dict = log['lm_answer']
    # get the answer with the highest probability
    letter_answer = max(prob_dict, key=prob_dict.get)
    idx_answer = ord(letter_answer) - ord('A')
    # list of values in prob_dict
    lm_probabilities = list(prob_dict.values())
    
    # append 0 to _bcot_option_prob_dict until the length is max_num_category
    while len(lm_probabilities) < max_num_category:
        lm_probabilities.append(0)
    # Append _bcot_option_prob_dict to pis
    pis.append(lm_probabilities)
    
    # map dp['gt_answer'] from char to int (A: 0, B: 1, ...) and append to gt_answers
    gt_answers.append(ord(log['gt_answer']) - ord('A'))
    nks[idx_answer] += 1
    lm_answers.append(idx_answer)
    lm_probability.append(prob_dict[letter_answer])
    # Append the length of the bcot_option_prob_dict to tags
    tags.append(len(lm_probabilities))
# Convert to tensors and numpy arrays
pis_tensor = torch.tensor(pis, dtype=torch.float32)
gt_answers_tensor = torch.tensor(gt_answers, dtype=torch.long)
tags_tensor = torch.tensor(tags, dtype=torch.long)
pis_np = np.array(pis)
gt_answers_np = np.array(gt_answers)
lm_answers_np = np.array(lm_answers)

model = pi_k_model(nks=nks, max_num_category=max_num_category)  # 5 for ScienceQA
if train:
    optimizer = torch.optim.LBFGS(model.parameters(), lr=0.00000000001, max_iter=100)
    criterion = ClassficationAndMDCA()
    # Train
    model.train()
    # Training
    def closure():
        optimizer.zero_grad()
        pis_map_mean = model(pis_tensor, tags)
        loss = criterion(pis_map_mean, gt_answers_tensor)
        loss.backward()
        return loss
    print(f"Lambda before optimization: {model.lamb.item()}")
    optimizer.step(closure)
    print(f"Lambda after optimization: {model.lamb.item()}")
else:
    model.eval()
    model.lamb = torch.nn.Parameter(torch.ones(1) * 6.999990091571817e-06)

# Calculate accuracy and ECE with pis_np, gt_answers_np, selected_answers_np before optimization
# transform int to char (0: A, 1: B, ...)
lm_answers = [chr(ord('A') + i) for i in lm_answers_np]
gt_answers = [chr(ord('A') + i) for i in gt_answers_np]
print(f"Accuracy before optimization on val: {accuracy(gt_answers, lm_answers)}")
print(f"ECE before optimization on val: {ece(gt_answers, lm_answers, lm_probability)}")
# Calculate accuracy and ECE after optimization
pis_map_mean = model(pis_tensor, tags)
lm_answers_after_optimization_idx = torch.argmax(pis_map_mean, dim=1).numpy()
# transform int to char (0: A, 1: B, ...)
lm_answers_after_optimization = [chr(ord('A') + i) for i in lm_answers_after_optimization_idx]
lm_probabilities_after_optimization = pis_map_mean.detach().numpy()
# pick the probability of the selected answer idx
lm_probability_after_optimization = [lm_probabilities_after_optimization[i][lm_answers_after_optimization_idx[i]] for i in range(len(lm_answers_after_optimization))]
print(f"Accuracy after optimization: {accuracy(gt_answers, lm_answers_after_optimization)}")
print(f"ECE after optimization: {ece(gt_answers, lm_answers_after_optimization, lm_probability_after_optimization)}")

Accuracy before optimization on val: 0.8638314475224347
ECE before optimization on val: 0.016665039668960494
Accuracy after optimization: 0.8638314475224347
ECE after optimization: 0.010480154417615075
