In [39]:
import json 

In [40]:
# Load the evaluation results from disk. The encoding is given explicitly so
# the parse does not depend on the platform's default locale encoding.
with open("/home/xiusic/RM-Bench/results/Gen_RMs/unhacked_flexible_step33/total_dataset_global_step33_hf_REWORD_MODEL_20250318_172310.json", encoding="utf-8") as json_file:
    data = json.load(json_file)

In [41]:
import numpy as np 
from typing import List, Dict, Any


def split_dataset_by_domain(dataset: List[Dict[str, Any]]) -> Dict[str, List[Dict[str, Any]]]:
    """Group examples by domain prefix and strip the ``'domain'`` key.

    An example is assigned to a domain when its ``'domain'`` string starts
    with one of ``"chat"``, ``"math"``, ``"code"`` or ``"safety"``. Examples
    matching none of these prefixes are dropped.

    The input is NOT modified: each returned example is a shallow copy of the
    original dict without its ``'domain'`` key. This keeps the function safe to
    call repeatedly on the same dataset (e.g. when a notebook cell is re-run).

    Args:
        dataset: Examples, each a dict containing a string ``'domain'`` entry.

    Returns:
        A dict with one entry per known domain (possibly an empty list), whose
        values are the domain-less copies of the matching examples.

    Raises:
        KeyError: If an example has no ``'domain'`` key.
    """
    domains = ["chat", "math", "code", "safety"]
    domain_dataset_dict = {}
    for domain in domains:
        domain_dataset_dict[domain] = [
            # Copy every field except 'domain' instead of popping it from the
            # caller's dict, so the caller's data stays intact.
            {key: value for key, value in example.items() if key != 'domain'}
            for example in dataset
            if example['domain'].startswith(domain)
        ]

    return domain_dataset_dict


def compute_accuracy_gen(results: List[Dict[str, Any]]) -> Dict[str, float]:
    """Compute hard / normal / easy accuracy from per-example ``"result"`` flags.

    Two input layouts are handled:

    * If the first example has a ``'domain'`` key, the list is split with
      ``split_dataset_by_domain`` and this function is applied to each
      non-empty domain. The returned dict then contains one averaged accuracy
      per domain, the cross-domain means of ``hard_acc`` / ``normal_acc`` /
      ``easy_acc``, and ``total_avg_acc`` (the mean of the per-domain averages).
    * Otherwise each example must provide ``example["result"]``, an indexable
      sequence in which the value ``1`` marks a hit, and the returned dict has
      the keys ``hard_acc``, ``normal_acc`` and ``easy_acc``.

    Args:
        results: Non-empty list of example dicts.

    Returns:
        A flat dict mapping metric names to accuracies.

    Raises:
        ValueError: If ``results`` is empty, or if no example falls into any
            known domain.
    """
    if not results:
        # Fail with a clear message instead of an IndexError on results[0]
        # (and a division by zero further down).
        raise ValueError("compute_accuracy_gen() requires a non-empty list of results")

    if 'domain' in results[0]:
        # this indicates this is total_dataset.json
        print('We are handling total_dataset.json')
        print('Splitting the dataset by domain...')
        # thus we need to split the results into different domains
        split_results = split_dataset_by_domain(results)
        # Domains without any example are skipped: they carry no information
        # and would otherwise make the recursive call fail on an empty list.
        domain_results = {
            domain: compute_accuracy_gen(examples)
            for domain, examples in split_results.items()
            if examples
        }
        if not domain_results:
            raise ValueError("no example belongs to a known domain")

        # Per-domain score: mean of that domain's hard / normal / easy accuracy.
        domain_avg_results = {
            domain: np.mean(list(scores.values()))
            for domain, scores in domain_results.items()
        }
        # Per-difficulty score: mean over domains (each domain weighted equally,
        # regardless of how many examples it holds).
        domain_hard_normal_easy_acc = {
            "hard_acc": np.mean([domain_results[domain]["hard_acc"] for domain in domain_results]),
            "normal_acc": np.mean([domain_results[domain]["normal_acc"] for domain in domain_results]),
            "easy_acc": np.mean([domain_results[domain]["easy_acc"] for domain in domain_results])
        }
        total_avg_acc = np.mean([domain_avg_results[domain] for domain in domain_avg_results])
        # merge everything into one flattened dictionary
        final_results = {}
        final_results.update(domain_avg_results)
        final_results.update(domain_hard_normal_easy_acc)
        final_results.update({"total_avg_acc": total_avg_acc})
        return final_results

    # Flat layout: every example carries a "result" sequence of flags.
    # Row i of the matrix counts the examples whose flag i equals 1.
    # NOTE(review): the increment does not depend on the column index j, so all
    # columns of a row end up identical. Earlier comments described a 3x3
    # chosen-vs-rejected score comparison, but only "result" is read here —
    # confirm that this aggregation is the intended one.
    MATRIX_SIZE = 3  # the column and row size of the matrix
    acc_matrix = np.zeros((MATRIX_SIZE, MATRIX_SIZE))
    for result in results:
        flags = result["result"]
        for i in range(len(flags)):
            if flags[i] != 1:
                continue
            for j in range(len(flags)):
                acc_matrix[i][j] += 1

    # turn counts into rates by dividing by the number of examples
    acc_matrix /= len(results)

    # Number of cells strictly above (equivalently, strictly below) the diagonal.
    off_diagonal_count = MATRIX_SIZE * (MATRIX_SIZE - 1) / 2
    # hard accuracy: the average of the strict upper-right triangle of the matrix
    hard_acc = np.sum(np.triu(acc_matrix, 1)) / off_diagonal_count
    # normal accuracy: the average of the diagonal of the matrix
    normal_acc = np.mean(np.diag(acc_matrix))
    # easy accuracy: the average of the strict lower-left triangle of the matrix
    easy_acc = np.sum(np.tril(acc_matrix, -1)) / off_diagonal_count

    return {
        "hard_acc": hard_acc,
        "normal_acc": normal_acc,
        "easy_acc": easy_acc
    }

In [42]:
# Run the accuracy computation on the loaded results and display the returned dict.
compute_accuracy_gen(data)

We are handling total_dataset.json
Splitting the dataset by domain...


{'chat': 0.6563307493540051,
 'math': 0.5803402646502835,
 'code': 0.5014619883040936,
 'safety': 0.9342403628117913,
 'hard_acc': 0.6730477505536333,
 'normal_acc': 0.6680933412800434,
 'easy_acc': 0.6631389320064536,
 'total_avg_acc': 0.6680933412800434}

In [43]:
# Count, per position of the "result" list, how many examples have a flag of 1.
# The position-to-name mapping (0 -> easy, 1 -> normal, 2 -> hard) follows the
# variable names used in this cell — TODO confirm against the file's producer.
total = len(data)
easy = sum(1 for entry in data if entry['result'][0] == 1)
normal = sum(1 for entry in data if entry['result'][1] == 1)
hard = sum(1 for entry in data if entry['result'][2] == 1)

In [44]:
# Display the three per-position hit counts and the number of examples.
easy, normal, hard, total

(917, 919, 918, 1327)

In [45]:
# Fraction of examples whose first flag is 1, computed from the tallies rather
# than from hand-copied numbers so it stays correct when the data changes.
easy / total

0.6910324039186134

In [48]:
# Micro-averaged hit rate over all three flags of every example, computed from
# the tallies rather than from hand-copied numbers.
(easy + normal + hard) / (total * 3)

0.691785983421251