In [1]:
import sys, subprocess, json, math, os, re, logging, itertools
from pathlib import Path
import pandas as pd
from tqdm import tqdm

import nltk
nltk.download("wordnet", quiet=True)
nltk.download("omw-1.4", quiet=True)

import sacrebleu, Levenshtein
from rouge_score import rouge_scorer
from sentence_transformers import SentenceTransformer, util, models
from bert_score import score as bertscore
from metrics_evaluation.metrics import codebleu

In [2]:
# ---------- 3. CrystalBLEU helper ---------------------------------------------------------------
COMMON_PATTERNS = {
    "for(": None, "while(": None, "if(": None, "std::": None,
    "ns3::": None, "return": None, "{": None, "}": None, ";": None,
}
def filter_common_ngrams(code: str) -> str:
    tokens = code.split()
    return " ".join(t for t in tokens if t not in COMMON_PATTERNS)

def crystal_bleu(ref: str, hyp: str) -> float:
    ref_f = filter_common_ngrams(ref)
    hyp_f = filter_common_ngrams(hyp)
    return sacrebleu.corpus_bleu([hyp_f], [[ref_f]]).score

# ---------- 4. clone-similarity (token Jaccard) -------------------------------------------------
def token_jaccard(ref: str, hyp: str) -> float:
    tok_ref = set(re.findall(r"[A-Za-z_]\w+", ref))
    tok_hyp = set(re.findall(r"[A-Za-z_]\w+", hyp))
    return len(tok_ref & tok_hyp) / (len(tok_ref | tok_hyp) or 1)

# ---------- 5. CodeBERT models (embedding + BERTScore) ------------------------------------------
word_embedding_model = models.Transformer('microsoft/codebert-base')

# Configure pooling to use the [CLS] token, as recommended by the CodeBERT paper
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension(),
    pooling_mode='cls'  # Use CLS token instead of default mean pooling
)

# Build the SentenceTransformer model
embed_model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
def bertscore_code(refs, hyps):
    P, R, F = bertscore(hyps, refs, lang="en", model_type="microsoft/codebert-base",
                        num_layers=12, verbose=False, idf=False)
    return F.mean().item()

# ---------- 6. data paths -----------------------------------------------------------------------
SRC_DIRS = [Path("../Dataset/Codes/Large"), Path("../Dataset/Codes/Small")]
GEN_DIR  = Path("Basic/Gemini")          # from earlier notebook cells
GEN_DIR.mkdir(parents=True, exist_ok=True)

In [3]:
rows = []
scorer_rouge = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=False)

for ref_path in tqdm(list(itertools.chain.from_iterable(d.glob("*.cc") for d in SRC_DIRS)),
                     desc="Scoring"):
    ref_code = ref_path.read_text(encoding="utf-8", errors="ignore")
    gen_path = GEN_DIR / ref_path.name
    if gen_path.exists():
        hyp_code = gen_path.read_text(encoding="utf-8", errors="ignore")
    else:
        hyp_code = None  # will yield NaNs

    def safe(metric_fn, *, default=float("nan")):
        try:
            return metric_fn()
        except Exception as e:
            logging.warning("Metric failed (%s, %s): %s", ref_path.name, metric_fn.__name__, e)
            return default

    row = {
        "file": ref_path.name,
        "BLEU": safe(lambda: sacrebleu.corpus_bleu([hyp_code], [[ref_code]]).score if hyp_code else float("nan")),
        "ROUGE_L": safe(lambda: scorer_rouge.score(ref_code, hyp_code)["rougeL"].fmeasure * 100
                        if hyp_code else float("nan")),
        "ChrF": safe(lambda: sacrebleu.corpus_chrf([hyp_code], [[ref_code]]).score if hyp_code else float("nan")),
        "CodeBLEU": safe(lambda: codebleu(ref_code, hyp_code) * 100 if hyp_code else float("nan")),
        "Levenshtein": safe(lambda: Levenshtein.distance(ref_code, hyp_code) if hyp_code else float("nan")),
        "CodeBERTScore": safe(lambda: bertscore_code([ref_code], [hyp_code]) * 100 if hyp_code else float("nan")),
        "EmbeddingCosine": safe(lambda: util.cos_sim(embed_model.encode(ref_code, convert_to_tensor=True),
                                                     embed_model.encode(hyp_code, convert_to_tensor=True)).item()
                                if hyp_code else float("nan")),
        "CloneJaccard": safe(lambda: token_jaccard(ref_code, hyp_code) * 100 if hyp_code else float("nan")),
    }
    rows.append(row)


Scoring: 100%|██████████| 400/400 [04:43<00:00,  1.41it/s]


In [4]:
prompt_type = "Basic"
model_name = "Gemini"
df = pd.DataFrame(rows)
avg_row = {"file": "AVERAGE"}
for col in df.columns[1:]:
    avg_row[col] = df[col].mean()
df = pd.concat([df, pd.DataFrame([avg_row])], ignore_index=True)

csv_path = Path(f"{prompt_type}_{model_name}_metrics.csv")
df.to_csv(csv_path, index=False)
print(f"✅ Saved metrics to {csv_path.resolve()}")

✅ Saved metrics to E:\LLM\simcode-bench\Generation\Basic_Gemini_metrics.csv


In [5]:
df.head()

Unnamed: 0,file,BLEU,ROUGE_L,ChrF,CodeBLEU,Levenshtein,CodeBERTScore,EmbeddingCosine,CloneJaccard
0,1.cc,47.27816,61.420345,74.987273,74.36163,1028.0,94.146919,0.996086,55.725191
1,10.cc,27.283335,24.504249,57.985073,74.908191,5428.0,90.853161,0.996697,29.207921
2,100.cc,11.568054,25.242131,31.50735,36.98215,18456.0,90.522099,0.997032,26.90678
3,101.cc,39.386718,29.272152,55.54455,75.618515,4193.0,87.129527,0.993921,29.906542
4,102.cc,19.549206,15.433404,42.653884,58.75252,7562.0,88.665617,0.990747,20.229885
