In [1]:
import sys
import os

import json
import torch
import numpy as np

from typing import List
from tqdm import tqdm
from pathlib import Path

sys.path.append(os.path.abspath("..")) # append calibration folder to path for McEval import

In [2]:
# in
path_mceval = "/data/tyler/dok/viola/exchange/robin/results/Qwen/Qwen2.5-Coder-32B-Instruct/results.jsonl"
path_mceval_logprobs = "/data/tyler/dok/viola/exchange/robin/results/Qwen/Qwen2.5-Coder-32B-Instruct/token_scores.pt"
path_mceval_evaluation = "/data/tyler/dok/viola/exchange/robin/results/Qwen/Qwen2.5-Coder-32B-Instruct/evaluation/details.jsonl"
# out
path_mceval_processed = "output/McEval/Qwen2.5-Coder-32B-Instruct/preprocessed/mceval_results.jsonl"

In [37]:
path_gpt = '/data/tyler/dok/viola/exchange/robin/results/mceval/gpt-oss-20b/gpt-oss-20b_generation.jsonl'
gpt_logprobs_dir = '/data/tyler/dok/viola/exchange/robin/results/mceval/gpt-oss-20b/gpt-oss-20b_generation'
gpt_eval = '/data/tyler/dok/viola/exchange/robin/results/mceval/gpt-oss-20b/detail.jsonl'
gpt_processed = "output/McEval/GPT-OSS-20B/preprocessed/mc_eval_results.jsonl"

In [39]:
path_qwen3 = '/data/tyler/dok/viola/exchange/robin/results/mceval/Qwen3-Coder-30B-A3B-Instruct/Qwen3-Coder-30B-A3B-Instruct_generation.jsonl'
qwen3_logprobs_dir = '/data/tyler/dok/viola/exchange/robin/results/mceval/Qwen3-Coder-30B-A3B-Instruct/Qwen3-Coder-30B-A3B-Instruct_generation'
qwen3_eval = '/data/tyler/dok/viola/exchange/robin/results/mceval/Qwen3-Coder-30B-A3B-Instruct/detail.jsonl'
qwen3_processed = "output/McEval/Qwen3-Coder-30B-A3B-Instruct/preprocessed/mc_eval_results.jsonl"

In [4]:
from McEval.eval.extract import extract

def prepare_mceval_sample(sample: dict):
    language = sample['task_id'].split('/')[0]
    id = sample["task_id"] + "_" + str(sample["generation"])
    try:
        code = extract(sample["raw_generation"][0], sample, language)
    except:
        print(f'+++++ Extract {sample["task_id"]} failed')
        code = ""
    return {
        "id": id,
        "name": sample["entry_point"],
        "prompt": sample["instruction"],
        "program": code,
        "language": language,
        "output_size": len(sample["raw_generation"][0]),
        "difficulty": sample["level"],
    }

In [5]:
def process_mceval_evaluation(input_path=path_mceval_evaluation, max_samples=None):
    i = 0
    results = {}
    with open(input_path, "r") as f:   
        for line in f:
            if max_samples and i>=max_samples:
                break
             # strip leading junk before first { or [
            if "{" in line or "[" in line:
                i = min(
                    [idx for idx in (line.find("{"), line.find("[")) if idx != -1]
                )
                line = line[i:]
            chunk = json.loads(line)
            for evaluation in chunk:
                results[evaluation['task_id'] + "_" + str(evaluation['generation'])] = evaluation['pass']
       
            i+=1
    return results

evaluation_results = process_mceval_evaluation(input_path=gpt_eval, max_samples=10)

In [None]:
# Candidates for code slicing: 73597, 13874, 19324

STOP_TOKENS = ['<|endoftext|>', '<|im_end|>' ]

def clean_logprob_sequence(logprobs_list: List[dict]) -> np.ndarray:
    [logp for sublist in logprobs_list['vocab'] for (logp,rank,tok) in sublist.values()  if rank==1]
    # trim & convert to np array
    # Trim at first stop token or first -inf prob. TODO check again what's wrong here
    stop_candidates = ([i for i, (_, tok, _) in enumerate(logprobs_list) if tok in STOP_TOKENS] 
                       + [i for i, (_, _, prob) in enumerate(logprobs_list) if prob == float('-inf')])
    if len(stop_candidates) > 0:
        idx = min(stop_candidates)
    else:
        idx = len(logprobs_list) 
    
    result = np.array([prob for _, _, prob in logprobs_list[:idx]])
    # clean remaining -inf values
    #result = np.where(np.isneginf(result), -10000, result) #check this!
    return result


In [52]:
broken = ['C#.jsonl', 'Visual Basic.jsonl', 'Erlang.jsonl', 'F#.jsonl'] # definitely broken evaluation

def load_logprob_data(logprob_dir:str)-> dict:
    logprob_data = {}
    n = 0
    pbar = tqdm(Path(logprob_dir).iterdir())
    for file in pbar:
        if file.name not in broken:
            with open(file, "r") as f:   
                language = file.stem
               
                pbar.set_description(f"Language: {language}")
                #logprob_data[language] = []
                for line in f:
                    
                    # strip leading junk before first { or [
                    if "{" in line or "[" in line:
                        i = min(
                            [idx for idx in (line.find("{"), line.find("[")) if idx != -1]
                        )
                        line = line[i:]
                    chunk = json.loads(line)
                    if chunk['task_id'] not in logprob_data:
                        
                        logprob_data[chunk['task_id']] = [None] * 10

                    logprob_data[chunk['task_id']][chunk['generation']] = chunk
                    
                    n += 1
                    if n > 10:
                        break
    return logprob_data                
    
logprob_data = load_logprob_data(gpt_logprobs_dir)

Language: Scala: : 40it [00:00, 70.51it/s]       


In [None]:
broken_languages = ['AWK', 'C#', 'Erlang', 'F#', 'Visual Basic', 'Markdown'] 
# AWK -> shady evaluation, 
# Markdown -> neginf probs, 
# rest: buggy evaluation

# def get_logprob_info_for_old(sample_id: str, logprob_data: dict):
#     name, generation = sample_id.rsplit("_", 1)
#     generation = int(generation)
#     if name in logprob_data and generation-1 < len(logprob_data[name]):
#         logprobs = clean_logprob_sequence(logprob_data[name][generation])
        
#         token_count = len(logprobs)
#         cumulative_logprob = sum([float(x) for x in logprobs])
#         return logprobs.tolist(), token_count, cumulative_logprob
#     else:
#         raise ValueError(f"Logprob data not found for sample id {sample_id}")


def get_logprob_info_for(sample_id: str, logprob_data: dict):
    name, generation = sample_id.rsplit("_", 1)
    generation = int(generation)
    if name in logprob_data and logprob_data[name][generation] != None:
        logprobs = [logp for sublist in logprob_data[name][generation]['vocab'] 
                    for (logp,rank,tok) in sublist.values()  
                    if rank==1]
        
        token_count = len(logprob_data[name][generation]['token_ids'])
        cumulative_logprob = sum(logprobs)
        return logprobs, token_count, cumulative_logprob
    else:
        raise ValueError(f"Logprob data not found for sample id {sample_id}")


def process_mceval_results(evaluation_results: dict,
                           logprob_data: dict,
                           input_path: str=path_mceval, 
                           output_path: str=path_mceval_processed,
                           max_samples: int=None):
    i = 0
    broken = {l:0 for l in broken_languages}
    results = []
    with open(input_path, "r") as infile, open(output_path, "w") as outfile:   
        for line in infile:
            if max_samples and i>=max_samples:
                break
            # strip leading junk before first { or [
            if "{" in line or "[" in line:
                j = min(
                    [idx for idx in (line.find("{"), line.find("[")) if idx != -1]
                )
                line = line[j:]
            mceval_sample = json.loads(line)
            if mceval_sample['task_id'].split('/')[0] not in broken_languages:
                sample = prepare_mceval_sample(mceval_sample)
                sample['is_correct'] = evaluation_results.get(sample['id'], False)
                logprobs, token_count, cumulative_logprob = get_logprob_info_for(sample['id'], logprob_data)
                sample["token_count"] = token_count
                sample["token_logprobs"] = logprobs
                sample['cumulative_logprob'] = cumulative_logprob
                sample['raw_generation'] = mceval_sample['raw_generation']
                
                # DEBUG
                if (token_count == 0
                    or np.exp(cumulative_logprob / token_count) == 0.0):
                    print('check me')
                
                
                
                results.append(sample)
                outfile.write(json.dumps(sample) + "\n")
                i+=1
            else: broken[mceval_sample['task_id'].split('/')[0]] +=1
    print(broken)
    return results

results = process_mceval_results(evaluation_results, 
                                 logprob_data, 
                                 input_path=path_gpt,
                                 output_path=gpt_processed, 
                                )
#results = process_mceval_results(evaluation_results, logprob_data, output_path="/dev/null", max_samples=20)

In [46]:
path_r1 = '/data/tyler/dok/viola/exchange/robin/results/mceval/DeepSeek-R1-Distill-Qwen-32B/DeepSeek-R1-Distill-Qwen-32B_generation.jsonl'
r1_logprobs_dir = '/data/tyler/dok/viola/exchange/robin/results/mceval/DeepSeek-R1-Distill-Qwen-32B/DeepSeek-R1-Distill-Qwen-32B_generation'
r1_eval = '/data/tyler/dok/viola/exchange/robin/results/mceval/DeepSeek-R1-Distill-Qwen-32B/detail.jsonl'
r1_processed = "output/McEval/DeepSeek-R1-Distill-Qwen-32B/preprocessed/mc_eval_results.jsonl"

In [50]:
def preprocess(data_path:str, evaluation_path:str, logprob_path:str, output_path:str):
    evaluation_results = process_mceval_evaluation(input_path=evaluation_path)
    print('evaluation done.')
    logprob_data = load_logprob_data(logprob_path)
    
    process_mceval_results(evaluation_results, 
                                 logprob_data, 
                                 input_path=data_path,
                                 output_path=output_path, 
                                )
    
#preprocess(path_qwen3, qwen3_eval, qwen3_logprobs_dir, qwen3_processed)
#preprocess(path_r1, r1_eval, r1_logprobs_dir, r1_processed)
preprocess(path_gpt, gpt_eval, gpt_logprobs_dir, gpt_processed)

evaluation done.


Language: Scala: : 40it [17:23, 26.09s/it]       


['logical function has_close_elements(numbers, size, threshold)\n    !------------------------------------------------------------------\n    !  Arguments\n    !------------------------------------------------------------------\n    real,    intent(in) :: numbers(size)   ! input array\n    integer, intent(in) :: size            ! number of elements\n    real,    intent(in) :: threshold       ! distance threshold\n\n    !------------------------------------------------------------------\n    !  Local variables\n    !------------------------------------------------------------------\n    integer :: i, j\n    real    :: dist\n\n    has_close_elements = .false.           ! assume "no close pair"\n\n    do i = 1, size-1\n      do j = i+1, size\n        dist = abs(numbers(j) - numbers(i))\n        if (dist < threshold) then\n          has_close_elements = .true.\n          return                       ! early exit\n        end if\n      end do\n    end do\n\n  end function has_close_elements