In [1]:
import os, psutil, gc
import time 
import json
import pprint

from collections import defaultdict
import random

import numpy as np
np.set_printoptions(precision=4)
from scipy.stats import ttest_rel

In [2]:
from sal.config import Config

from core import best_of_n
from utils.load_data import load_data_prm800k
from utils import grader 

In [3]:
# base_dir
base_dir = '/groups/kjun/tnn/datasets/'

# dataset path
data_dir = base_dir + "/prm800k/math_splits"

# llm and prm path
llm_dir = base_dir + "/Llama-3.2-1B-Instruct-GGUF/Llama-3.2-1B-Instruct.Q4_K_M.gguf"
prm_dir = base_dir + "/Llama3.1-8B-PRM-Deepseek-Data-GGUF/Llama3.1-8B-PRM-Deepseek-Data.Q4_K_M.gguf"

llm_tokenizer_dir = base_dir + "/Llama-3.2-1B-Instruct"
prm_tokenizer_dir = base_dir + "/Llama3.1-8B-PRM-Deepseek-Data"

In [4]:
#  load data 
data_by_levels = load_data_prm800k(data_dir)


# ds_completions = load_completions(completions_dir)

# load random_seeds     
# random_seeds = np.loadtxt("random_seeds.txt").astype("int64")
# random_seeds = [int(seed) for seed in random_seeds]

1: 43
2: 90
3: 105
4: 128
5: 134


In [7]:
def evaluate_correctness(data_dir, data_by_levels, num_trials, num_budgets=None):
    with open(data_dir, 'r', encoding='utf-8') as fin:
        # all_correctness = []
        all_correctness = np.zeros((num_trials, len(data_by_levels)))
        gt_answers = []
        trial_idx = 0
        for line in fin:
            if trial_idx >= num_trials:
                break
                
            trial_data = json.loads(line)
            for q_idx in range(len(data_by_levels)):
                if num_budgets is not None:
                    completions = trial_data['completions'][q_idx][:num_budgets]
                else:
                    completions = trial_data['completions'][q_idx]
            
                gt_answer = data_by_levels[q_idx]['answer']
                is_correct = False
                for cidx, completion in enumerate(completions):
                    print(completion)
                    stop
                    c_answer = grader.extract_last_boxed_answer(completion)
                    if grader.grade_answer(c_answer, gt_answer):
                        is_correct = True
                        break

                all_correctness[trial_idx][q_idx] = is_correct
                # all_correctness.append(is_correct)

            trial_idx += 1
        
    return all_correctness


# general params
config = Config()
config.agg_strategy = 'last'
config.n = 8
config.beam_width = 4
config.lookahead = 0
config.num_iterations = 1
config.sort_completed = False

# diverse_select params
config.lam = 10
config.normalize_embeds = True

level = '4'
num_questions = len(data_by_levels[level])
# num_questions = 1
num_trials = 20
print(f"num_questions = {num_questions}")

bon_dir = "results/generate_bon_prm800k_level4_n16_v11.jsonl" 
sd_dir = f"results/generate_sd_prm800k_level{level}_n{config.n}_bw{config.beam_width}_depth{config.num_iterations}_lam{config.lam}_v11.jsonl"
print(sd_dir)

# bon_correctness = \
#     evaluate_correctness(bon_dir, data_by_levels[level], num_trials, config.n)
sd_correctness = \
    evaluate_correctness(sd_dir, data_by_levels[level], num_trials)

print(bon_correctness.shape)
print(sd_correctness.shape)
bon_correctness_per_question = np.mean(bon_correctness, axis=0)
sd_correctness_per_question = np.mean(sd_correctness, axis=0)
print(bon_correctness_per_question.shape)

print(f"bon_score = {np.mean(bon_correctness)}")
print(f"sd_score = {np.mean(sd_correctness)}")
print(f"bon_score_per_question = {np.mean(bon_correctness_per_question)}")
print(f"sd_score_per_question = {np.mean(sd_correctness_per_question)}")
print(sd_correctness_per_question[:10])
print(bon_correctness_per_question[:10])


# print(bon_correctness)
# print(bon_best_correctness)
# num_differences = np.sum(np.array(bon_correctness) != np.array(bon_best_correctness))
# print(num_differences)
# print(np.sum(sd_correctness != sd_best_correctness))

t_stat, p_value = ttest_rel(bon_correctness.flatten().astype(int), sd_correctness.flatten().astype(int))
print(f"t_stat = {t_stat:0.4f}")
print(f"p_value = {p_value:0.4f}")

t_stat, p_value = ttest_rel(bon_correctness_per_question, sd_correctness_per_question)
# t_stat, p_value = ttest_rel(np.array(bon_best_correctness).astype(int), np.array(sd_best_correctness).astype(int))
print(f"t_stat = {t_stat:0.4f}")
print(f"p_value = {p_value:0.4f}")

num_questions = 128
results/generate_sd_prm800k_level4_n8_bw4_depth1_lam10_v11.jsonl
## Step 1: To find the angle between two lines, we can use the dot product formula derived from the cross product of two vectors representing the lines.

We are given two lines defined by the equations $2x = 3y = -z$ and $6x = -y = -4z$.

## Step 2: First, we need to express these equations in vector form to represent the lines. We can do this by defining $\mathbf{i}$, $\mathbf{j}$, and $\mathbf{k}$ as the standard basis vectors.

Let $\mathbf{v}_1 = (2, 3, -1)$ be the direction vector of the first line, and $\mathbf{v}_2 = (6, -1, -4)$ be the direction vector of the second line.

## Step 3: We can now use the dot product formula to find the angle between the two lines.

The dot product formula is given by $\mathbf{u} \cdot \mathbf{v} = |\mathbf{u}| |\mathbf{v}| \cos(\theta)$, where $\theta$ is the angle between the vectors $\mathbf{u}$ and $\mathbf{v}$.

## Step 4: Let's calculate the dot product of $

NameError: name 'stop' is not defined