In [1]:
import sys
import os

import json
import torch
import numpy as np

from typing import List
from tqdm import tqdm

# Append the parent ("calibration") folder to the import path so the `McEval`
# package imported further down can be found. ".." is resolved by
# os.path.abspath against the current working directory, so the kernel has to
# be started from this notebook's own folder for the import to succeed.
sys.path.append(os.path.abspath(".."))

In [6]:
# Model whose McEval run is post-processed here. Every path below is derived
# from these two names, so switching models is a two-line change instead of
# four hand-edited strings.
mceval_model_org = "Qwen"
mceval_model_name = "Qwen2.5-Coder-32B-Instruct"

# in
mceval_results_dir = f"/data/tyler/dok/viola/exchange/robin/results/{mceval_model_org}/{mceval_model_name}"
path_mceval = f"{mceval_results_dir}/results.jsonl"  # raw samples, parsed line by line with json.loads
path_mceval_logprobs = f"{mceval_results_dir}/token_scores.pt"  # read with torch.load
path_mceval_evaluation = f"{mceval_results_dir}/evaluation/details.jsonl"  # per-sample 'pass' flags
# out
path_mceval_processed = f"output/McEval/{mceval_model_name}/preprocessed/mceval_results.jsonl"

In [2]:
from McEval.eval.extract import extract

def prepare_mceval_sample(sample: dict) -> dict:
    """Flatten one raw McEval result record into the record used downstream.

    Args:
        sample: A parsed line of the McEval results file. The keys read here
            are 'task_id' (its part before the first '/' is taken as the
            language), 'generation', 'raw_generation' (only element 0 is
            used), 'entry_point', 'instruction' and 'level'.

    Returns:
        A dict with the keys 'id' ("<task_id>_<generation>"), 'name',
        'prompt', 'program', 'language', 'output_size' (character length of
        the first raw generation) and 'difficulty'. 'program' is whatever
        `extract` returns, or "" when extraction fails.

    Raises:
        KeyError / IndexError: if a required key is missing or
            'raw_generation' is empty.
    """
    language = sample['task_id'].split('/')[0]
    sample_id = sample["task_id"] + "_" + str(sample["generation"])
    try:
        code = extract(sample["raw_generation"][0], sample, language)
    except Exception:
        # Best effort: a failed extraction must not abort the whole run, so the
        # sample is kept with an empty program. `Exception` (rather than a bare
        # `except`) lets KeyboardInterrupt / SystemExit still stop the notebook.
        print(f'+++++ Extract {sample["task_id"]} failed')
        code = ""
    return {
        "id": sample_id,
        "name": sample["entry_point"],
        "prompt": sample["instruction"],
        "program": code,
        "language": language,
        "output_size": len(sample["raw_generation"][0]),
        "difficulty": sample["level"],
    }

In [7]:
def process_mceval_evaluation(input_path=path_mceval_evaluation, max_samples=None):
    """Read the McEval evaluation details and index the verdicts by sample id.

    Every non-blank line of the file is parsed as JSON (after dropping any
    characters that precede the first '{' or '[') and iterated as a collection
    of evaluation dicts, each providing 'task_id', 'generation' and 'pass'.

    Args:
        input_path: Path of the evaluation details file.
        max_samples: Maximum number of lines to parse; None or 0 reads the
            whole file.

    Returns:
        Dict mapping "<task_id>_<generation>" to the record's 'pass' value.
        When an id occurs more than once, the last occurrence wins.
    """
    lines_read = 0
    results = {}
    with open(input_path, "r") as f:
        for line in f:
            if max_samples and lines_read >= max_samples:
                break
            if not line.strip():
                # Blank lines carry no data and would make json.loads fail.
                continue
            # strip leading junk before first { or [
            # The offset lives in its own variable: reusing the line counter
            # for it reset the counter on every line and disabled max_samples.
            json_starts = [idx for idx in (line.find("{"), line.find("[")) if idx != -1]
            if json_starts:
                line = line[min(json_starts):]
            chunk = json.loads(line)
            for evaluation in chunk:
                results[evaluation['task_id'] + "_" + str(evaluation['generation'])] = evaluation['pass']

            lines_read += 1
    return results

# Lookup table "<task_id>_<generation>" -> pass flag, read from path_mceval_evaluation.
# NOTE(review): max_samples=10 looks like a debugging cap; pass None to read the
# complete evaluation file before doing a full export.
evaluation_results = process_mceval_evaluation(max_samples=10)

In [8]:
# Candidates for code slicing: 73597, 13874, 19324

# Tokens that terminate a generation; the sequence is cut at the first one.
STOP_TOKENS = ['<|endoftext|>', '<|im_end|>']

def clean_logprob_sequence(logprobs_list: List[List]) -> np.ndarray:
    """Return the log-probs of a token sequence up to its first cut point.

    Each entry of `logprobs_list` is unpacked as a 3-tuple: the second element
    is compared against STOP_TOKENS and the third is the log-prob. The sequence
    is cut at the first entry that is a stop token or whose log-prob is -inf;
    that entry and everything after it is dropped. Without such an entry the
    whole sequence is kept.

    Because the cut happens at the first -inf, the returned array never
    contains -inf itself.

    TODO (carried over from the original note): check again what's wrong here.
    """
    neg_inf = float('-inf')
    # Positions come out in ascending order, so the first hit is the earliest cut.
    cut_points = [
        position
        for position, (_, token, logprob) in enumerate(logprobs_list)
        if token in STOP_TOKENS or logprob == neg_inf
    ]
    end = cut_points[0] if cut_points else len(logprobs_list)

    kept = logprobs_list[:end]
    return np.array([logprob for _, _, logprob in kept])


In [9]:
# load logprobs
# NOTE(security): torch.load is pickle-based, so only point it at token-score
# files that you produced yourself or otherwise trust.
# map_location="cpu" remaps any stored tensors to the CPU on load.
# Later cells index the result as logprob_data[<task name>][<generation>].
logprob_data = torch.load(path_mceval_logprobs, map_location="cpu")

In [11]:
# Languages whose samples process_mceval_results skips and only counts:
#   AWK                -> shady evaluation
#   Markdown           -> -inf log-probs
#   all other entries  -> buggy evaluation
broken_languages = ['AWK', 'C#', 'Erlang', 'F#', 'Visual Basic', 'Markdown'] 

def get_logprob_info_for(sample_id: str, logprob_data: dict):
    """Look up and summarise the token log-probs of one generated sample.

    Args:
        sample_id: Id of the form "<name>_<generation>"; it is split at the
            last underscore and the generation part is parsed as an int.
        logprob_data: Mapping from name to a sequence of per-generation token
            entries, indexed as logprob_data[name][generation].

    Returns:
        A tuple (logprobs, token_count, cumulative_logprob): the cleaned
        log-probs as a list, their count, and their sum as a float (0.0 for an
        empty sequence).

    Raises:
        ValueError: if `sample_id` cannot be split/parsed, if the name is
            unknown, or if the generation index is outside the stored range.
    """
    name, generation = sample_id.rsplit("_", 1)
    generation = int(generation)
    # The guard must test the same index that is used for the lookup below.
    # Checking `generation-1` let generation == len(...) through and surfaced
    # as an IndexError; negative values silently wrapped around to the end.
    # NOTE(review): the lookup treats `generation` as 0-based — confirm this
    # against how the token-score file was written.
    if name not in logprob_data or not 0 <= generation < len(logprob_data[name]):
        raise ValueError(f"Logprob data not found for sample id {sample_id}")

    logprobs = clean_logprob_sequence(logprob_data[name][generation])
    token_count = len(logprobs)
    # Start at 0.0 so that an empty sequence yields a float as well.
    cumulative_logprob = sum((float(x) for x in logprobs), 0.0)
    return logprobs.tolist(), token_count, cumulative_logprob


def process_mceval_results(evaluation_results: dict,
                           logprob_data: dict,
                           input_path: str=path_mceval, 
                           output_path: str=path_mceval_processed,
                           max_samples: int=None):
    """Join McEval samples with their verdicts and log-probs and export them.

    Every non-blank line of `input_path` is parsed as one JSON sample (after
    dropping any characters that precede the first '{' or '['). Samples whose
    language is listed in `broken_languages` are only counted. All others are
    converted with `prepare_mceval_sample`, enriched with 'is_correct',
    'token_count', 'token_logprobs' and 'cumulative_logprob', written to
    `output_path` as one JSON object per line, and collected.

    Args:
        evaluation_results: Mapping from sample id to its pass flag; ids that
            are missing are treated as not correct (False).
        logprob_data: Token log-prob data handed to `get_logprob_info_for`.
        input_path: Path of the raw McEval results file.
        output_path: Destination file; it is overwritten and its parent
            folder is created when missing.
        max_samples: Maximum number of kept (non-broken) samples; None or 0
            processes the whole file.

    Returns:
        The list of exported sample dicts. The per-language count of skipped
        samples is printed before returning.

    Raises:
        ValueError: propagated from `get_logprob_info_for` when a kept sample
            has no log-prob data.
    """
    # open(..., "w") does not create missing folders, so make sure the
    # destination exists before the first run.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    kept = 0
    broken = {l: 0 for l in broken_languages}
    results = []
    with open(input_path, "r") as infile, open(output_path, "w") as outfile:
        for line in infile:
            if max_samples and kept >= max_samples:
                break
            if not line.strip():
                # Blank lines carry no data and would make json.loads fail.
                continue
            # strip leading junk before first { or [
            json_starts = [idx for idx in (line.find("{"), line.find("[")) if idx != -1]
            if json_starts:
                line = line[min(json_starts):]
            mceval_sample = json.loads(line)

            language = mceval_sample['task_id'].split('/')[0]
            if language in broken_languages:
                broken[language] += 1
                continue

            sample = prepare_mceval_sample(mceval_sample)
            sample['is_correct'] = evaluation_results.get(sample['id'], False)
            logprobs, token_count, cumulative_logprob = get_logprob_info_for(sample['id'], logprob_data)
            sample["token_count"] = token_count
            sample["token_logprobs"] = logprobs
            sample['cumulative_logprob'] = cumulative_logprob

            # DEBUG: flag samples without tokens or whose mean token
            # probability underflows to zero, and say which sample it is.
            if (token_count == 0
                    or np.exp(cumulative_logprob / token_count) == 0.0):
                print(f'check me: {sample["id"]} (token_count={token_count}, '
                      f'cumulative_logprob={cumulative_logprob})')

            results.append(sample)
            outfile.write(json.dumps(sample) + "\n")
            kept += 1
    print(broken)
    return results

# Smoke run: the export goes to /dev/null and stops after 20 kept samples, so
# nothing is persisted. For the real export, drop `output_path` (it defaults to
# path_mceval_processed) and `max_samples`.
results = process_mceval_results(evaluation_results, logprob_data, output_path="/dev/null", max_samples=20)

{'AWK': 0, 'C#': 0, 'Erlang': 0, 'F#': 0, 'Visual Basic': 0, 'Markdown': 0}
