In [1]:
import json
import numpy as np

# Layout of one retrieval chunk file (a JSON list, loaded into `data`):
#   data[i]            -> one instance with "question" (code prompt, str),
#                         "answers" (ground truth code, str) and "ctxs"
#   data[i]["ctxs"][j] -> j-th retrieved candidate, a dictionary with 5 keys:
#       'id' (path, str), 'title' (str), 'text' (retrieved code, str),
#       'score' (similarity, float), 'has_answer' (bool)
# According to the original notes a chunk holds 1000 items (i in [0, 999]) and
# every item has 200 candidates (j in [0, 199]).
top_k = 200
instances_num = 23107

scores_raw_all = np.zeros((instances_num, top_k))  # row r holds the top_k scores of instance r
codes_retrieved_all = []      # list of instances_num lists, each list has top_k items
codes_ground_truth_all = []   # list of instances_num code strings

row = 0  # next free row of scores_raw_all; also the number of instances loaded so far
for chunk_no in range(1, 25):
    with open("output_ret/retrieval_chunk_" + str(chunk_no) + "_top200_result.json") as chunk_file:
        data = json.load(chunk_file)
        for instance in data:
            codes_ground_truth_all.append(instance["answers"])
            # Index explicitly up to top_k so a short "ctxs" list fails loudly here.
            scores_raw_all[row] = np.array([instance["ctxs"][j]["score"] for j in range(top_k)])
            codes_retrieved_all.append([instance["ctxs"][j]["text"] for j in range(top_k)])
            row += 1

# Optional sanity checks for the JSON loading (left disabled):
# print(0 in scores_raw_all) # False
# for i in range(len(codes_retrieved_all)): assert len(codes_retrieved_all[i]) == top_k
assert row == len(codes_retrieved_all) == len(codes_ground_truth_all) == instances_num

In [2]:
# Exact match: an instance counts when the retrieved snippet with the highest
# raw score is string-equal to its ground truth code.
exact_matches = sum(
    codes_retrieved_all[idx][np.argmax(scores_raw_all[idx])] == codes_ground_truth_all[idx]
    for idx in range(instances_num)
)

# Fraction of instances whose top-scored snippet matched exactly
exact_match_rate = exact_matches / instances_num
print(f"Exact Match Rate: {exact_match_rate:.4f}")


Exact Match Rate: 0.0000


In [3]:
from nltk.translate.bleu_score import corpus_bleu
from tqdm import tqdm

# Parallel lists fed to corpus_bleu:
#   all_candidates[k] -> token list of the top-scored retrieved snippet
#   all_references[k] -> list holding the single tokenized ground truth
all_candidates = []
all_references = []
for idx in tqdm(range(instances_num), desc="BLEU scoring..."):
    # Snippet with the highest raw retrieval score for this instance
    top_hit = codes_retrieved_all[idx][np.argmax(scores_raw_all[idx])]

    # Whitespace splitting only; this is a simplistic tokenization!
    all_candidates.append(top_hit.split())
    # corpus_bleu takes a list of references per candidate, hence the extra list
    all_references.append([codes_ground_truth_all[idx].split()])

# One BLEU value computed over the whole corpus
bleu_score = corpus_bleu(all_references, all_candidates)
print("Corpus-level BLEU score:", bleu_score)

BLEU scoring...: 100%|██████████| 23107/23107 [00:00<00:00, 59845.96it/s]


Corpus-level BLEU score: 0.07487758390294394


In [4]:
import numpy as np
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# Assuming you have:
#   top_k, instances_num
#   scores_raw_all: shape (instances_num, top_k)
#   codes_retrieved_all: list of length instances_num, each a list of length top_k
#   codes_ground_truth_all: list of length instances_num

# 1. Tokenizer (defined just below)
import re

def tokenize_code(code_str):
    """
    Split a code string into tokens.

    A token is either a maximal run of ASCII letters, digits and underscores,
    or a single non-whitespace character. Whitespace is dropped.
    """
    return [match.group(0) for match in re.finditer(r"[A-Za-z0-9_]+|\S", code_str)]

smooth_fn = SmoothingFunction().method4  # method4 is a common smoothing method

# One sentence-level BLEU value per instance: the top-scored retrieved snippet
# (candidate) is compared against the single ground truth (reference).
# NLTK's sentence_bleu takes the references first, as a list of token lists.
bleu_scores = [
    sentence_bleu(
        [tokenize_code(codes_ground_truth_all[idx])],                             # list of references
        tokenize_code(codes_retrieved_all[idx][np.argmax(scores_raw_all[idx])]),  # candidate tokens
        smoothing_function=smooth_fn
    )
    for idx in range(instances_num)
]

# Average the instance-level BLEU scores; printed both as a fraction and as a percentage.
average_bleu = np.mean(bleu_scores)
print(f"Average sentence-level BLEU: {average_bleu:.4f} (i.e., {average_bleu*100:.2f}%)")


Average sentence-level BLEU: 0.1795 (i.e., 17.95%)


In [12]:
import re
import ast
import math
import numpy as np
from collections import Counter, deque

##############################################################################
# 1) Simple Code Tokenizer
##############################################################################

def tokenize_code(code_str):
    """
    Tokenize code into identifier-like runs and single symbols.

    Each token is either a maximal run of ASCII letters/digits/underscores or
    one non-whitespace character; whitespace itself never becomes a token.
    E.g., "def foo(x): return x+1" -> ["def", "foo", "(", "x", ")", ":", "return", "x", "+", "1"]
    """
    token_pattern = re.compile(r"[A-Za-z0-9_]+|\S")
    return token_pattern.findall(code_str)

##############################################################################
# 2) N-Gram BLEU (token-based), approximate from-scratch
##############################################################################

def ngram_counts(tokens, n):
    """
    Count every contiguous n-gram (as a tuple) of length `n` in `tokens`.

    Returns a Counter mapping n-gram tuple -> number of occurrences. When
    `tokens` has fewer than `n` items the Counter is empty.
    Example: tokens = ['def','foo','(',')']
             n=2 -> Counter({('def','foo'):1, ('foo','('):1, ('(',')'):1})
    """
    window_starts = range(len(tokens) - n + 1)
    return Counter(tuple(tokens[start:start + n]) for start in window_starts)

def clipped_ngram_count(candidate_tokens, reference_tokens, n):
    """
    Number of candidate n-grams that are also found in the reference, where
    each distinct n-gram is credited at most as often as it occurs in the
    reference: sum over n-grams of min(candidate_count, reference_count).
    """
    # Counter intersection keeps, per n-gram, the smaller of the two counts.
    shared = ngram_counts(candidate_tokens, n) & ngram_counts(reference_tokens, n)
    return sum(shared.values())

def sentence_bleu(candidate_tokens, reference_tokens, max_n=4, smooth_eps=1e-14):
    """
    Approximate sentence-level BLEU for one candidate against one reference.

    The score is the geometric mean of the clipped n-gram precisions for
    n = 1..max_n (each smoothed with `smooth_eps`), multiplied by a simple
    brevity penalty when the candidate is shorter than the reference.

    NOTE: this definition reuses the name of NLTK's ``sentence_bleu`` imported
    in an earlier cell, but the argument order differs: the candidate comes
    first here, and the reference is a single token list, not a list of them.

    Parameters:
        candidate_tokens: sequence of tokens of the candidate.
        reference_tokens: sequence of tokens of the single reference.
        max_n: largest n-gram order included in the geometric mean.
        smooth_eps: small constant added to numerator and denominator of every
            precision, and to the reference length in the length ratio.

    Returns:
        float BLEU score. An empty candidate scores 0.0.
    """
    # An empty candidate has nothing to match. Without this guard the length
    # ratio below is 0.0 and the brevity penalty divides by zero.
    if len(candidate_tokens) == 0:
        return 0.0

    # Precision for each n-gram order
    precisions = []
    for n in range(1, max_n+1):
        numerator = clipped_ngram_count(candidate_tokens, reference_tokens, n)
        # A candidate shorter than n tokens has no n-grams; use 1 to avoid dividing by 0.
        denominator = max(len(candidate_tokens) - n + 1, 1)
        # Add smoothing so a zero match count does not yield log(0)
        prec_n = (numerator + smooth_eps) / (denominator + smooth_eps)
        precisions.append(prec_n)

    # Geometric mean of precisions:
    # BLEU = exp( (1/N) * sum(log p_n) ), N=4 typically.
    # A precision can only be exactly 0 when smooth_eps is 0.
    if any(p == 0 for p in precisions):
        bleu = 0.0
    else:
        avg_log = sum(math.log(p) for p in precisions) / len(precisions)
        bleu = math.exp(avg_log)

    # Brevity penalty: exp(1 - 1/ratio) with ratio = candidate length / reference length,
    # applied only when the candidate is the shorter of the two.
    ratio = len(candidate_tokens) / (len(reference_tokens) + smooth_eps)
    if ratio < 1.0:
        bleu *= math.exp(1 - 1/ratio)

    return bleu

##############################################################################
# 3) Weighted N-Gram BLEU: example weighting by token type
##############################################################################

# A minimal set of Python keywords
PY_KEYWORDS = {
    # definitions / imports
    "def", "class", "lambda", "import", "from", "as", "global", "del",
    # control flow
    "if", "elif", "else", "for", "while", "break", "continue", "pass",
    "return", "yield", "with",
    # exceptions / checks
    "try", "except", "raise", "assert",
    # operators spelled as words
    "in", "and", "or", "not", "is",
}

def token_type(token):
    """
    Rough classification of a single token. The checks run in this order and
    the first one that applies wins:
    - "KEY" if the token is in PY_KEYWORDS
    - "STR" if the token starts and ends with the same quote character
      (naive check; a lone quote character also qualifies)
    - "NUM" if token.isdigit() is true
    - "ID"  if the token is alphanumeric + underscore (identifier-like)
    - "OP"  for everything else (punctuation/operators)
    """
    if token in PY_KEYWORDS:
        return "KEY"

    # naive string check: same quote character at both ends
    is_quoted = any(token.startswith(quote) and token.endswith(quote) for quote in ("'", '"'))
    if is_quoted:
        return "STR"

    if token.isdigit():
        return "NUM"

    # identifier-like tokens vs. operator/punctuation
    return "ID" if re.match(r"^[A-Za-z0-9_]+$", token) else "OP"

def weighted_ngram_bleu(candidate_tokens, reference_tokens, max_n=4, smooth_eps=1e-14):
    """
    Weighted BLEU: every n-gram contributes with a weight derived from the
    types of its tokens instead of counting as 1.

    Token-type weights: KEY=1.2, ID=1.0, NUM=1.0, OP=0.8, STR=1.0. The weight
    of an n-gram is the average of the weights of its tokens. For each order
    n = 1..max_n the precision is
        (weighted clipped matches + smooth_eps) / (weighted candidate n-grams + smooth_eps)
    and the result is the geometric mean of these precisions, multiplied by a
    simple brevity penalty when the candidate is shorter than the reference.

    Parameters:
        candidate_tokens: sequence of tokens of the candidate.
        reference_tokens: sequence of tokens of the single reference.
        max_n: largest n-gram order included in the geometric mean.
        smooth_eps: small constant used for smoothing and in the length ratio.

    Returns:
        float score. An empty candidate scores 0.0, and so does a candidate
        with fewer than max_n tokens (the precision of an order with no
        candidate n-grams is set to 0).
    """
    # An empty candidate has nothing to match. Without this guard the length
    # ratio below is 0.0 and the brevity penalty divides by zero.
    if len(candidate_tokens) == 0:
        return 0.0

    # weights for each token-type
    weight_map = {"KEY":1.2, "ID":1.0, "NUM":1.0, "OP":0.8, "STR":1.0}

    def ngram_weight(ngram):
        # Average of the token-type weights of the n-gram's tokens.
        tw_sum = 0.0
        for tok in ngram:
            tw_sum += weight_map.get(token_type(tok), 1.0)
        return tw_sum / len(ngram)

    precisions = []

    for n in range(1, max_n+1):
        cand_counter = ngram_counts(candidate_tokens, n)
        ref_counter  = ngram_counts(reference_tokens, n)
        weighted_clip_sum = 0.0        # weighted count of matched (clipped) n-grams
        total_candidate_ngrams = 0.0   # weighted count of all candidate n-grams

        for ngram, cand_val in cand_counter.items():
            # Credit an n-gram at most as often as it occurs in the reference.
            clip_amount = min(cand_val, ref_counter.get(ngram, 0))
            if clip_amount > 0:
                weighted_clip_sum += clip_amount * ngram_weight(ngram)
            total_candidate_ngrams += cand_val * ngram_weight(ngram)

        if total_candidate_ngrams < smooth_eps:
            # No candidate n-grams of this order (candidate shorter than n tokens).
            prec_n = 0.0
        else:
            prec_n = (weighted_clip_sum + smooth_eps) / (total_candidate_ngrams + smooth_eps)
        precisions.append(prec_n)

    # geometric mean; any zero precision forces the score to 0
    if any(p == 0 for p in precisions):
        wbleu = 0.0
    else:
        avg_log = sum(math.log(p) for p in precisions) / len(precisions)
        wbleu = math.exp(avg_log)

    # naive brevity penalty, applied only when the candidate is shorter than the reference
    ratio = len(candidate_tokens) / (len(reference_tokens) + smooth_eps)
    if ratio < 1.0:
        wbleu *= math.exp(1 - 1/ratio)

    return wbleu

##############################################################################
# 4) Syntax (AST) Match: parse Python code + BFS of node types, do n-gram similarity
##############################################################################

def ast_node_bfs_types(root):
    """
    Walk the AST below `root` breadth-first and collect the class name of
    every node visited, `root` included.
    Returns a list of node type names, e.g. ["Module", "FunctionDef", "arguments", "arg", ...].
    """
    type_names = []
    pending = deque([root])
    while pending:
        current = pending.popleft()
        type_names.append(type(current).__name__)
        # Children are queued behind everything already waiting -> breadth-first order.
        pending.extend(ast.iter_child_nodes(current))
    return type_names

def syntax_match_score(candidate_code, reference_code, smooth_eps=1e-14):
    """
    Structural similarity of two Python snippets.

    Both snippets are parsed into an AST, each tree is flattened into its
    breadth-first sequence of node type names, and the two sequences are
    compared with `sentence_bleu` (n-grams up to 4) as if they were tokens.
    Returns 0.0 when either snippet cannot be parsed.
    """
    try:
        cand_root, ref_root = ast.parse(candidate_code), ast.parse(reference_code)
    except Exception:
        # Unparseable input on either side: no syntax score.
        return 0.0

    # Node type names play the role of tokens for the BLEU-style comparison.
    return sentence_bleu(
        ast_node_bfs_types(cand_root),
        ast_node_bfs_types(ref_root),
        max_n=4,
        smooth_eps=smooth_eps,
    )

##############################################################################
# 5) Data-Flow Match (naive variable "def" lines)
##############################################################################

class DefUseVisitor(ast.NodeVisitor):
    """
    AST visitor that records where each variable is first assigned.

    After `visit(tree)`, `defs` maps a variable name to the line number of the
    first Name node in store context met for it during the traversal
    (e.g. the `i` in `for i in range(5):` or the `x` in `x = 1`).
    Only definitions are collected; uses are not tracked.
    """
    def __init__(self):
        # variable name -> line number of its first recorded assignment
        self.defs = {}

    def visit_Name(self, node):
        # Keep only the first store seen for a name; later stores are ignored.
        is_store = isinstance(node.ctx, ast.Store)
        if is_store and node.id not in self.defs:
            self.defs[node.id] = node.lineno
        self.generic_visit(node)

def dataflow_match_score(candidate_code, reference_code):
    """
    Naive data-flow similarity of two Python snippets.

    For each snippet, gather the set of (var, line_of_first_def) pairs found
    by `DefUseVisitor`, then return overlap / union of the two sets (a value
    in [0..1]). Because line numbers are part of the pairs, the same variable
    first assigned on different lines does not count as overlap.

    A snippet that cannot be parsed contributes an empty set.
    NOTE(review): as a consequence, two unparseable snippets (or two snippets
    without assignments) score 1.0 — confirm this is the intended behavior.
    """
    def get_var_defs(code):
        try:
            root = ast.parse(code)
        except Exception:
            # Parse errors are treated as "no definitions". Only Exception is
            # caught so that KeyboardInterrupt / SystemExit still propagate,
            # which also matches the handling in syntax_match_score.
            return set()
        v = DefUseVisitor()
        v.visit(root)
        # Return set of (var, line)
        return set(v.defs.items())

    cand_defs = get_var_defs(candidate_code)
    ref_defs  = get_var_defs(reference_code)

    if len(cand_defs) == 0 and len(ref_defs) == 0:
        return 1.0  # trivially no variables in both
    overlap = len(cand_defs.intersection(ref_defs))
    union  = len(cand_defs.union(ref_defs))
    # union >= 1 here because the both-empty case returned above; the epsilon
    # is kept unchanged so scores stay numerically identical to earlier runs.
    return overlap / (union + 1e-14)

##############################################################################
# 6) Combine the 4 sub-scores into a final CodeBLEU
##############################################################################

def calc_codebleu(candidate_code, reference_code, alpha=0.25, beta=0.25, gamma=0.25, theta=0.25):
    """
    Compute the four CodeBLEU components for one candidate/reference pair and
    their weighted sum.

    alpha, beta, gamma and theta weight, in this order, the n-gram score, the
    weighted n-gram score, the syntax (AST) score and the data-flow score.
    Returns a dict with the four sub-scores plus the combined "code_bleu".
    """
    # The two n-gram scores work on tokens; syntax and data-flow work on the raw strings.
    cand_tokens, ref_tokens = tokenize_code(candidate_code), tokenize_code(reference_code)

    scores = {
        "ngram_match_score": sentence_bleu(cand_tokens, ref_tokens, max_n=4),
        "weighted_ngram_match_score": weighted_ngram_bleu(cand_tokens, ref_tokens, max_n=4),
        "syntax_match_score": syntax_match_score(candidate_code, reference_code),
        "dataflow_match_score": dataflow_match_score(candidate_code, reference_code),
    }

    # Weighted sum of the four components
    scores["code_bleu"] = (
        alpha*scores["ngram_match_score"]
        + beta*scores["weighted_ngram_match_score"]
        + gamma*scores["syntax_match_score"]
        + theta*scores["dataflow_match_score"]
    )
    return scores

##############################################################################
# 7) Evaluate a code retriever: for each instance, pick the best snippet 
#    and compute average CodeBLEU
##############################################################################

def evaluate_codebleu_for_retriever(
    scores_raw_all, 
    codes_retrieved_all, 
    codes_ground_truth_all,
    alpha=0.25, beta=0.25, gamma=0.25, theta=0.25
):
    """
    Average CodeBLEU of a retriever's top-scored snippet over all instances.

    - scores_raw_all: array whose row i holds the retrieval scores of instance i
      (documented by the caller as shape (num_instances, top_k))
    - codes_retrieved_all: per instance, the list of retrieved code strings,
      in the same order as the scores
    - codes_ground_truth_all: per instance, the single ground-truth code string
    - alpha, beta, gamma, theta: component weights handed to calc_codebleu
    - returns a dict with the mean of every sub-score and of code_bleu

    `tqdm` is not imported in this cell; the name comes from the
    `from tqdm import tqdm` of an earlier cell.
    """
    # (key in calc_codebleu's result, key in the returned summary)
    metric_keys = [
        ("ngram_match_score", "avg_ngram_match_score"),
        ("weighted_ngram_match_score", "avg_weighted_ngram_match_score"),
        ("syntax_match_score", "avg_syntax_match_score"),
        ("dataflow_match_score", "avg_dataflow_match_score"),
        ("code_bleu", "avg_code_bleu"),
    ]
    collected = {result_key: [] for result_key, _ in metric_keys}

    for row_idx in tqdm(range(scores_raw_all.shape[0]), desc="Evaluating CodeBLEU..."):
        # The candidate is the retrieved snippet with the highest raw score.
        top_idx = np.argmax(scores_raw_all[row_idx])
        sub_scores = calc_codebleu(
            codes_retrieved_all[row_idx][top_idx],
            codes_ground_truth_all[row_idx],
            alpha=alpha,
            beta=beta,
            gamma=gamma,
            theta=theta
        )
        # The weights only affect "code_bleu"; the sub-scores are stored as
        # returned, so they could also be recombined outside this function.
        for result_key, _ in metric_keys:
            collected[result_key].append(sub_scores[result_key])

    # Mean of every metric across all instances
    return {summary_key: np.mean(collected[result_key]) for result_key, summary_key in metric_keys}

##############################################################################
# 8) Run the evaluation for the retriever
# Equal weights for the four CodeBLEU components
results = evaluate_codebleu_for_retriever(
    scores_raw_all,
    codes_retrieved_all,
    codes_ground_truth_all,
    alpha=0.25,
    beta=0.25,
    gamma=0.25,
    theta=0.25,
)

# Report: one line per averaged sub-score, then the combined score as a
# fraction and as a percentage.
report_rows = [
    ("  ngram_match_score:", "avg_ngram_match_score"),
    ("  weighted_ngram_match_score:", "avg_weighted_ngram_match_score"),
    ("  syntax_match_score:", "avg_syntax_match_score"),
    ("  dataflow_match_score:", "avg_dataflow_match_score"),
]
print("CodeBLEU sub-scores:")
for label, summary_key in report_rows:
    print(label, results[summary_key])
print(f"Overall CodeBLEU ~ {results['avg_code_bleu']:.4f} in [0..1]")
print(f"                ~ {results['avg_code_bleu']*100:.2f}%")


Evaluating CodeBLEU...: 100%|██████████| 23107/23107 [00:29<00:00, 776.33it/s] 

CodeBLEU sub-scores:
  ngram_match_score: 0.1743050169559238
  weighted_ngram_match_score: 0.17393650585395345
  syntax_match_score: 0.02536586482916037
  dataflow_match_score: 0.18617734885532522
Overall CodeBLEU ~ 0.1399 in [0..1]
                ~ 13.99%



