In [1]:
import json
import numpy as np

top_k = 200
instances_num = 23107

# Layout of every retrieval chunk file (a JSON list; "data" below):
#   data has 1000 items; for each item data[i], i in [0, 999], there are
#     "question" -> code prompt (str)
#     "answers"  -> ground truth code (str)
#     "ctxs"     -> list of dictionaries data[i]["ctxs"][j], j in [0, 199]
#   each ctxs dictionary has 5 keys:
#     'id' (path, str), 'title' (str), 'text' (retrieved code, str),
#     'score' (similarity, float), 'has_answer' (bool)

scores_raw_all = np.zeros((instances_num, top_k))  # one row of top_k scores per instance
codes_retrieved_all = []     # list of instances_num lists, each list has top_k items
codes_ground_truth_all = []  # list of instances_num code strings

row = 0  # running instance index across all chunk files
for chunk_id in range(1, 25):
    chunk_path = "output_ret/retrieval_chunk_" + str(chunk_id) + "_top200_result.json"
    with open(chunk_path) as f:
        data = json.load(f)

    for item in data:
        ctxs = item["ctxs"]
        codes_ground_truth_all.append(item["answers"])
        scores_raw_all[row] = np.array([ctxs[j]["score"] for j in range(top_k)])
        codes_retrieved_all.append([ctxs[j]["text"] for j in range(top_k)])
        row += 1

# Optional checks for the json file loading (left disabled):
# print(0 in scores_raw_all) # False
# for i in range(len(codes_retrieved_all)): assert len(codes_retrieved_all[i]) == top_k

# Every container must end up with exactly one entry per instance.
assert row == len(codes_retrieved_all) == len(codes_ground_truth_all) == instances_num

In [2]:
# Count the instances whose highest-scored retrieved snippet is string-identical
# to the ground truth code.
exact_matches = sum(
    1
    for idx in range(instances_num)
    if codes_retrieved_all[idx][np.argmax(scores_raw_all[idx])] == codes_ground_truth_all[idx]
)

# Fraction of instances with an exact match.
exact_match_rate = exact_matches / instances_num
print(f"Exact Match Rate: {exact_match_rate:.4f}")


Exact Match Rate: 0.0000


In [3]:
from nltk.translate.bleu_score import corpus_bleu
from tqdm import tqdm

# Parallel lists fed to corpus_bleu:
#   all_candidates[k] -> whitespace-split tokens of the top-scored retrieved snippet
#   all_references[k] -> one-element list holding the whitespace-split ground truth
all_candidates = []
all_references = []
for idx in tqdm(range(instances_num), desc="BLEU scoring..."):
    top_hit = np.argmax(scores_raw_all[idx])  # column of the highest raw score

    # Plain str.split() is a deliberately simplistic tokenization.
    all_candidates.append(codes_retrieved_all[idx][top_hit].split())
    # corpus_bleu takes a list of references per candidate, hence the extra list.
    all_references.append([codes_ground_truth_all[idx].split()])

# Corpus-level BLEU over all instances.
bleu_score = corpus_bleu(all_references, all_candidates)
print("Corpus-level BLEU score:", bleu_score)

BLEU scoring...: 100%|██████████| 23107/23107 [00:00<00:00, 59845.96it/s]


Corpus-level BLEU score: 0.07487758390294394


In [4]:
import numpy as np
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# Assuming you have:
#   top_k, instances_num
#   scores_raw_all: shape (instances_num, top_k)
#   codes_retrieved_all: list of length instances_num, each a list of length top_k
#   codes_ground_truth_all: list of length instances_num

# 1. Tokenizer (defined just below)
import re

def tokenize_code(code_str):
    """
    Split a code string into tokens with a single regex pass.

    A token is either a maximal run of ASCII letters, digits and underscores,
    or one single non-whitespace character. Whitespace is discarded.

    :param code_str: source code as a string
    :return: list of token strings in order of appearance
    """
    token_pattern = r"[A-Za-z0-9_]+|\S"
    return [match.group(0) for match in re.finditer(token_pattern, code_str)]

# Smoothing keeps sentence-level BLEU from collapsing when some n-gram order
# has no overlap; method4 is the variant chosen here.
smooth_fn = SmoothingFunction().method4

# One sentence-level BLEU score per instance: the top-scored retrieved snippet
# is the candidate and the ground truth is the single reference.
bleu_scores = [
    sentence_bleu(
        [tokenize_code(codes_ground_truth_all[idx])],  # list of references (just one)
        tokenize_code(codes_retrieved_all[idx][np.argmax(scores_raw_all[idx])]),
        smoothing_function=smooth_fn,
    )
    for idx in range(instances_num)
]

# Average the instance-level scores; shown both as a fraction and as a percentage.
average_bleu = np.mean(bleu_scores)
print(f"Average sentence-level BLEU: {average_bleu:.4f} (i.e., {average_bleu*100:.2f}%)")


Average sentence-level BLEU: 0.1795 (i.e., 17.95%)


In [10]:
import re

# -----------------------------------------------------------------------
# 1. Make sure you've installed the CodeBLEU code in your environment
#    so that "from codebleu import calc_codebleu" works.
#    In the official CodeXGLUE repo, you usually do:
#        cd Code-Code/codebleu
#        pip install -e .
#    Then you can import it here:

from codebleu import calc_codebleu

# -----------------------------------------------------------------------
# 2. Define a simple tokenizer for code.

def tokenize_code(code_str):
    """
    Simplistic regex tokenizer for code.

    Emits each maximal run of ASCII alphanumeric/underscore characters as one
    token and every other non-whitespace character as its own token;
    whitespace itself is dropped.

    :param code_str: source code as a string
    :return: list of token strings in order of appearance
    """
    token_regex = re.compile(r"[A-Za-z0-9_]+|\S")
    return token_regex.findall(code_str)

# -----------------------------------------------------------------------
# 3. Prepare code-level references and candidates:
#    We pick only the "best" retrieved snippet for each instance (top raw score).
#    Then store them as code strings (optionally tokenized) for CodeBLEU.

def prepare_candidates_and_references(scores_raw_all, codes_retrieved_all, codes_ground_truth_all):
    """
    Build the CodeBLEU inputs from the top-scored retrieval of every instance.

    For each row of ``scores_raw_all`` the snippet with the highest raw score is
    taken as the candidate and paired with that instance's ground truth. Both
    strings are run through ``tokenize_code`` and re-joined with single spaces.

    NOTE(review): joining tokens on spaces discards the original newlines and
    indentation; this may matter for the syntax-aware parts of CodeBLEU on
    Python code - confirm that this normalization is intended.

    :param scores_raw_all: array of shape (num_instances, top_k)
    :param codes_retrieved_all: list of length num_instances, each an array/list of top_k code strings
    :param codes_ground_truth_all: list of length num_instances, each a single ground-truth code string
    :return: two lists:
        candidates:  [candidate_string_i, ...] (one per instance)
        references:  [[ref_string_i], ...]     (one-element list per instance)
    """
    def _normalize(code_str):
        # Tokenize, then glue the tokens back together as one space-separated string.
        return " ".join(tokenize_code(code_str))

    candidates, references = [], []
    for row_idx in range(scores_raw_all.shape[0]):
        top_hit = np.argmax(scores_raw_all[row_idx])  # best-scoring snippet for this row
        candidates.append(_normalize(codes_retrieved_all[row_idx][top_hit]))
        # Single ground truth per instance, wrapped in a list of references.
        references.append([_normalize(codes_ground_truth_all[row_idx])])

    return candidates, references

# `prediction` is a list with one candidate string per instance and `reference`
# is a list with one [ground_truth] list per instance, i.e. they are already in
# the corpus-level shape calc_codebleu takes (references: list of lists of str,
# predictions: list of str).
prediction, reference = prepare_candidates_and_references(scores_raw_all, codes_retrieved_all, codes_ground_truth_all)

# Pass the lists directly. Wrapping them again as [reference] / [prediction]
# (the pattern for a single reference/prediction *string*) would turn the whole
# corpus into one mis-typed "sample" instead of scoring each instance.
# NOTE(review): the previously recorded "TypeError: an integer is required"
# looks like a tree-sitter / tree-sitter-python version mismatch inside
# codebleu rather than a problem with these arguments - confirm the installed
# versions if it persists.
result = calc_codebleu(reference, prediction, lang="python", weights=(0.25, 0.25, 0.25, 0.25), tokenizer=None)
print(result)


TypeError: an integer is required