In [4]:
import sys
import os

import orjson
import ijson
import json
import torch
import numpy as np

from typing import List, Optional
from tqdm import tqdm

sys.path.append(os.path.abspath("..")) # append calibration folder to path for McEval import

In [5]:
# Input/output pair matching the arguments of the (commented-out)
# `convert_to_jsonl(data_path, out_path)` call in the next cell.
data_path = "output/Qwen3-Coder-30B-A3B/Scenario.codegeneration_10_0.2_eval_all.json"
out_path = "output/Qwen3-Coder-30B-A3B/Scenario.codegeneration_10_0.2_eval_all.jsonl"
# NOTE(review): `jsonlines` and `evaluation_data_path` are not referenced by any
# code visible in this notebook - confirm whether they are still needed.
jsonlines = "output/Qwen3-Coder-30B-A3B/codegeneration_10_0.2_eval_all.jsonl"
evaluation_data_path = "output/Qwen3-Coder-30B-A3B/preprocessed/codegeneration_10_0.2.jsonl"

In [6]:


# convert to jsonl
def clean_logprobs(logprobs_list):
    """Replace every 'logprob' value by its string representation, in place.

    param logprobs_list: list of sequences; each sequence is a list of dicts
        (one per token position) mapping a token id to an entry that has a
        'logprob' key.
    return: the same (mutated) list object that was passed in.
    """
    # presumably needed because the streamed values are not directly
    # serialisable by orjson - TODO confirm
    for seq in logprobs_list:
        for candidates in seq:
            for entry in candidates.values():
                entry['logprob'] = str(entry['logprob'])
    return logprobs_list

def convert_to_jsonl(data_path, out_path):
    """Stream a JSON array file into a JSON Lines file, one element per line.

    param data_path: path of the input file; its top level is iterated with
        the ijson prefix "item", i.e. element by element.
    param out_path: path of the output file; it is opened in "wb" mode and
        therefore overwritten.

    For each element, 'pass@1' and every 'logprob' inside 'logprobs_list' are
    converted to strings before the element is serialised with orjson.
    """
    with open(data_path, "rb") as src, open(out_path, "wb") as dst:
        # Streaming keeps only one array element in memory at a time.
        for record in ijson.items(src, "item"):
            record['pass@1'] = str(record['pass@1'])
            record['logprobs_list'] = clean_logprobs(record['logprobs_list'])
            dst.write(orjson.dumps(record) + b"\n")
            
# convert_to_jsonl(data_path, out_path)

    


In [7]:
# Katharina's changes:


def top20_logprob_sum_f(sample: dict):
    """Sum, per token position, the logprobs of all candidates ranked 1 to 20.

    param sample: dict with a "logprobs_list" key holding one list per
        sequence; each list holds one dict per token position whose values
        carry "rank" and "logprob".
    return: list (one per sequence) of lists (one number per token position).
        A position without any candidate in ranks 1..20 yields the int 0.
    """
    per_sequence_sums = []
    for sequence in sample["logprobs_list"]:
        position_sums = []
        for candidates in sequence:
            # Plain left-to-right accumulation, starting from int 0.
            total = 0
            for entry in candidates.values():
                if 1 <= entry["rank"] <= 20:
                    total += float(entry["logprob"])
            position_sums.append(total)
        per_sequence_sums.append(position_sums)
    return per_sequence_sums


def search(sample: dict):
    """Find code-block token positions per sequence using hard-coded token ids.

    param sample: dict with a "logprobs_list" key; every sequence is a list of
        dicts (one per token position) mapping token id -> entry with "rank".
    return: one list per sequence:
        - [] when no opening fence (id "73594" followed by id "12669") is found,
        - [start] when no, or more than one, closing position is found after
          the last opening fence,
        - [start, end] when exactly one closing position is found.
        `start` is the index right after the opening fence (and after id "198"
        when it follows directly); `end` is the index just before the closing
        token.
    """
    # This is tokenizer-dependent
    # Relevant tokens for Qwen3: 73594='```',
    #                            12669='python',
    #                            13874='``',
    #                            19324='` '
    #                            41233='Ġ```Ċ'

    coding_tok_list = []
    for sequence in sample["logprobs_list"]:
        # Rank-1 token id per position; stays 0 if a position has no rank-1 entry.
        top_ids = []
        for candidates in sequence:
            top_id = 0
            for token_id, entry in candidates.items():
                if entry["rank"] == 1:
                    top_id = token_id
            top_ids.append(top_id)

        n_tokens = len(top_ids)

        # Start index of the code that follows each opening fence.
        openings = [
            i + 3 if top_ids[i + 2] == "198" else i + 2
            for i in range(n_tokens - 2)
            if top_ids[i] == "73594" and top_ids[i + 1] == "12669"
        ]

        # Only the last opening fence is considered.
        # TODO (from original author): needs to handle incomplete last sequences
        position = openings[-1:]
        if position:
            # The scan stops one short of the final token, so j + 1 is always valid.
            for j in range(position[0], n_tokens - 1):
                closes_block = top_ids[j] == "73594" or (
                    top_ids[j] == "13874" and top_ids[j + 1] == "19324"
                )
                if closes_block:
                    position.append(j - 1)

        # Ignore end positions if multiple are found (open question in the
        # original: "@Katharina: -> why?").
        if len(position) > 2:
            position = position[:1]

        coding_tok_list.append(position)

    return coding_tok_list


def coding_logprob_f(position_list, logprob_list):
    """Collect per-sequence statistics at the start and end of the code span.

    param position_list: one entry per sequence, each a list of token indices.
        Only entries of length 1 (``[first]``) or 2 (``[first, last]``) are
        evaluated; any other length marks the sequence as having no usable
        code span.
    param logprob_list: one list of per-token numbers per sequence (same
        length as ``position_list``).
    return: dict with the keys "first_token", "first_20_token", "last_token"
        and "last_20_token". Each value is a list with one element per
        sequence: a number, or ``[]`` when the statistic is unavailable.
    raises AssertionError: if both arguments do not have the same length.
    """
    assert len(position_list) == len(
        logprob_list
    ), "Position list and logprob list need to be of equal size"

    first_token = []
    first_20_token = []
    last_token = []
    last_20_token = []

    for seq_logprobs, seq_positions in zip(logprob_list, position_list):
        if not 1 <= len(seq_positions) < 3:
            first_token.append([])
            first_20_token.append([])
            last_20_token.append([])
            last_token.append([])
            continue

        first_idx = seq_positions[0]
        first_token.append(seq_logprobs[first_idx])

        # NOTE(review): this bound needs 21 values from first_idx on although
        # the window only uses 20 - kept unchanged, confirm whether intended.
        if (first_idx + 20) <= (len(seq_logprobs) - 1):
            first_20_token.append(
                float(np.mean(seq_logprobs[first_idx : first_idx + 20]))
            )
        else:
            first_20_token.append([])

        if len(seq_positions) == 2:
            last_idx = seq_positions[1]
            last_token.append(seq_logprobs[last_idx])

            window_start = last_idx - 19
            if window_start >= 0:
                last_20_token.append(
                    float(np.mean(seq_logprobs[window_start : last_idx + 1]))
                )
            else:
                # A negative slice start would wrap around to the end of the
                # list (empty or unrelated window). Mark the value as missing,
                # as the first-20 window does.
                last_20_token.append([])
        else:
            last_20_token.append([])
            last_token.append([])

    return {
        "first_token": first_token,
        "first_20_token": first_20_token,
        "last_token": last_token,
        "last_20_token": last_20_token,
    }


def test(sample):
    candidates = ("73594", "12669", "13874", "19324", "41233")

    hit = []
    for sequence in sample["logprobs_list"]:
        hit_seq = []
        index = 0  # count = Token-Nr. (beginnend mit 0)
        for token in sequence:
            for id, k in token.items():
                if (
                    k["rank"] == 1
                    and (k["decoded_token"].find("``") != -1)
                    and id not in candidates
                ):
                    hit_seq.append([id, k["decoded_token"], index])

            index += 1
        if len(hit_seq) > 0:
            hit.extend(hit_seq)
    if len(hit) > 0:
        print(f"Check sample {sample['question_title']}: {hit}")


def prepare_sample(sample: dict):
    """Turn one evaluated benchmark sample into a flat record of logprob features.

    param sample: dict providing "question_id", "question_title",
        "question_content", "code_list", "output_list", "graded_list",
        "difficulty" and "logprobs_list" (per generation: one dict per token
        position, mapping token id -> entry with at least "rank" and
        "logprob").
    return: dict of metadata and per-generation statistics (see the keys of
        the returned literal below). Every list has one element per
        generation.

    Means over an empty slice (e.g. when no code block was located and the
    code span is [0, 0]) evaluate to NaN and make numpy emit a RuntimeWarning.
    """
    sequences = sample["logprobs_list"]

    # Logprob of the rank-1 candidate at every token position.
    logprobs = [
        [k["logprob"] for d in sequence for k in d.values() if k["rank"] == 1]
        for sequence in sequences
    ]

    # [start, stop] token indices of the code block for every generation.
    code_token_idxs = [extract_code_tokens(seq) for seq in sequences]

    logprob_sum = top20_logprob_sum_f(sample)
    # The sum is always divided by 20, even if a position has fewer candidates.
    average_top20 = [[x / 20 for x in seq] for seq in logprob_sum]
    avg_top20_code_probs = [
        float(np.mean(seq[start:stop]))
        for seq, (start, stop) in zip(average_top20, code_token_idxs)
    ]
    code_logprobs = [
        float(np.mean(np.array(seq[start:stop], dtype=float)))
        for seq, (start, stop) in zip(logprobs, code_token_idxs)
    ]
    # "Tail" = the last 40 token positions of a generation.
    avg_top20_tail = [float(np.mean(seq[-40:])) for seq in average_top20]
    tail_logprobs = [float(np.mean(np.array(seq[-40:], dtype=float))) for seq in logprobs]
    # coding_tok_pos = search(sample)
    coding_log_probs = coding_logprob_f(code_token_idxs, logprob_sum)

    # test(sample)

    return {
        "id": sample["question_id"],
        "name": sample["question_title"],
        "prompt": sample["question_content"],
        "program": sample["code_list"],
        "language": "python",
        "output_size": [len(output) for output in sample["output_list"]],
        "token_count": [len(sequence) for sequence in sequences],
        "is_correct": sample["graded_list"],
        "token_logprobs": logprobs,
        "avg_top20": average_top20,
        "avg_top20_code_probs": avg_top20_code_probs,
        "avg_top20_tail": avg_top20_tail,
        "code_logprob": code_logprobs,
        "tail_logprob": tail_logprobs,
        "cumulative_logprob": [sum([float(x) for x in seq]) for seq in logprobs],
        "code_token_idx": code_token_idxs,
        "top_logprobs_first_token": [sequence[0] for sequence in logprob_sum],
        "top_logprobs_first_20_tokens": [
            float(np.mean(sequence[0:20])) for sequence in logprob_sum
        ],
        "top_logprobs_last_token": [sequence[-1] for sequence in logprob_sum],
        # `sequence[-20:]` clamps for short sequences. An explicit
        # `len(sequence) - 20` start goes negative below 20 tokens and would
        # then select only the last `20 - len(sequence)` values.
        "top_logprobs_last_20_tokens": [
            float(np.mean(sequence[-20:])) for sequence in logprob_sum
        ],
        "top_logprobs_first_coding_token": coding_log_probs["first_token"],
        "top_logprobs_first_20_coding_tokens": coding_log_probs["first_20_token"],
        "top_logprobs_last_coding_tokens": coding_log_probs["last_token"],
        "top_logprobs_last_20_coding_tokens": coding_log_probs["last_20_token"],
        "difficulty": sample["difficulty"],
    }

In [8]:

def is_rank(prob_entry:dict, rank:Optional[int]=1) -> bool:
    """Tell whether a logprob entry carries the requested rank.

        param prob_entry: dict with a 'rank' key
        param rank: rank to compare against (defaults to 1)
    """
    entry_rank = prob_entry['rank']
    return entry_rank == rank

def extract_code_tokens(probs_list:List[dict]) -> List[int]:
    """Locate the last ```python ... ``` block of one generation.
        param probs_list: list of token probs for 1 generation (one dict per
            token position; its values carry 'rank' and 'decoded_token')
        return: [start_token, end_token] token indices, meant to be used as
            the slice start:stop. [0, 0] when no complete block is found.

        The rank-1 tokens are concatenated, the markers are searched in that
        text and the character offsets are mapped back to token indices: the
        first token beginning at or after the respective marker offset.
    """
    # NOTE(review): presumably the rank-1 entry is the generated token - confirm.
    top_tokens = [x for entry in probs_list for x in filter(is_rank, entry.values())]
    tokens = [t['decoded_token'] for t in top_tokens]

    # Join tokens into a single string to find indices
    joined = "".join(tokens)
    start_marker = "```python"
    end_marker = "```"

    # Last opening fence, then the first closing fence after it
    start_idx = joined.rfind(start_marker)
    if start_idx == -1:
        return [0, 0]

    end_idx = joined.find(end_marker, start_idx + len(start_marker))
    if end_idx == -1:
        return [0, 0]

    # Map character indices back to token indices
    cumulative = 0  # character offset at which token i begins
    start_token, end_token = None, None
    for i, tok in enumerate(tokens):
        if start_token is None and cumulative >= start_idx:
            start_token = i
        if cumulative >= end_idx:
            end_token = i
            break
        cumulative += len(tok)

    if start_token is not None and end_token is None:
        # The closing fence begins inside the final token (e.g. "\n```"), so no
        # token starts at or after it. The block was found, so end on that
        # final token rather than reporting "no block".
        end_token = len(tokens) - 1

    if start_token is not None and end_token is not None:
        return [start_token, end_token]
    return [0, 0]
    


In [9]:
def preprocess_data(input_path, output_path, max_samples=None):
    """Convert a JSON Lines file of raw samples into preprocessed records.

    param input_path: JSON Lines file; every line is parsed with orjson and
        passed to `prepare_sample`.
    param output_path: file that receives one serialised record per line; it
        is opened in "wb" mode and therefore overwritten.
    param max_samples: stop after this many lines; a falsy value (None, 0)
        means no limit.
    """
    with open(input_path, "rb") as src, open(output_path, "wb") as dst:
        for n_done, raw_line in enumerate(tqdm(src)):
            if max_samples and n_done >= max_samples:
                break
            record = prepare_sample(orjson.loads(raw_line))
            dst.write(orjson.dumps(record) + b"\n")



In [None]:
# Full preprocessing pipeline for LiveCodeBench
#
# Per model there are three paths, matching the parameters of
# `prepare_lcb_for_evaluation(in_path, jsonl_path, out_path)`:
#   in_path_*    - input of the JSON -> JSONL conversion
#   jsonl_path_* - output of the conversion and input of the preprocessing
#   out_path_*   - output of the preprocessing
#
# NOTE(review): the Qwen3 JSONL file name keeps the "Scenario." prefix while the
# GPT-OSS one drops it - confirm which naming is intended.

# Qwen3-Coder
in_path_qwen3 = "output/Qwen3-Coder-30B-A3B/Scenario.codegeneration_10_0.2_eval_all.json"
jsonl_path_qwen3 = "output/Qwen3-Coder-30B-A3B/Scenario.codegeneration_10_0.2_eval_all.jsonl"
out_path_qwen3 = "output/Qwen3-Coder-30B-A3B/preprocessed/codegeneration_10_0.2.jsonl"

# gpt-oss
in_path_gpt_oss = "output/GPT-OSS-20B/Scenario.codegeneration_10_0.2_eval_all.json"
jsonl_path_gpt_oss = "output/GPT-OSS-20B/codegeneration_10_0.2_eval_all.jsonl"
out_path_gpt_oss = "output/GPT-OSS-20B/preprocessed/codegeneration_10_0.2.jsonl"


def prepare_lcb_for_evaluation(in_path, jsonl_path, out_path):
    """Run both preprocessing stages for one result file.

    param in_path: JSON file handed to `convert_to_jsonl`.
    param jsonl_path: JSON Lines file written by the first stage and read by
        the second.
    param out_path: file written by `preprocess_data`.
    """
    # Stage 1: JSON array -> JSON Lines
    convert_to_jsonl(data_path=in_path, out_path=jsonl_path)
    # Stage 2: JSON Lines -> preprocessed records
    preprocess_data(input_path=jsonl_path, output_path=out_path)
    
# Full runs (JSON -> JSONL -> preprocessed), currently disabled:
# prepare_lcb_for_evaluation(in_path=in_path_gpt_oss,
#                            jsonl_path=jsonl_path_gpt_oss,
#                            out_path=out_path_gpt_oss)

# prepare_lcb_for_evaluation(in_path=in_path_qwen3,
#                            jsonl_path=jsonl_path_qwen3,
#                            out_path=out_path_qwen3)

# Only the second stage is executed here, so the GPT-OSS JSONL file must
# already exist. Pass max_samples=10 for a quick partial run.
preprocess_data(jsonl_path_gpt_oss, out_path_gpt_oss) #, max_samples=10)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
10it [02:07, 12.79s/it]


### Groups
- loc
- cyclomatic complexity
- len output
- len code
- difficulty
- len prompt
- language

(identifier count, etc)

### Metrics
- mean logprob
@ Katharina:
- sum top-20 logprobs (last n tokens, last token, last code token, first token)
     1) last 20 tokens (n should be configurable)
     2) last token
     3) first 20 tokens
     4) last 20 code tokens
     5) last code token
     6) first code token