# Setup

In [1]:
import json
from scipy import stats
from numpy import mean
from collections import Counter
import itertools
from numpy import nanmean

def l_extend(l):
    """Flatten one level of nesting: a list of iterables becomes a single list."""
    return list(itertools.chain.from_iterable(l))

# CoNaLa

In [2]:
# Systems whose outputs appear in the CoNaLa grade file; their names are the
# suffixes of the per-example "grade-<system>" / "<metric>-<system>" keys.
conala_models_list = [
    'baseline',
    'tranx-annot',
    'best-tranx',
    'best-tranx-rerank',
    'codex',
]

# (metric key in the data file, display name used when printing), kept as
# pairs so the two derived lists below can never drift out of alignment.
_conala_metric_table = [
    ("bleu", "BLEU"),
    ("codebleu", "CodeBLEU"),
    ("chrf", "chrF"),
    ("rougel", "ROUGE-L"),
    ("meteor", "METEOR"),
    ("ruby", "RUBY"),
    ("codebertscore_f1", "CodeBERTSCORE-F1 (w/o S.)"),
    ("codebertscore_s_f1", "CodeBERTSCORE-F1 (w/ S.)"),
    ("codebertscore_f3", "CodeBERTSCORE-F3 (w/o S.)"),
    ("codebertscore_s_f3", "CodeBERTSCORE-F3 (w/ S.)"),
    ("gpt35_nsnr", "GPT-3.5 (w/o R.)"),
    ("gpt35_nswr", "GPT-3.5 (w/ R.)"),
    ("gpt35_wsnr", "GPT-3.5 (w/o R.) + 0-shot-CoT"),
    ("gpt35_wswr", "GPT-3.5 (w/ R.) + 0-shot-CoT"),
]
exp_metrics = [metric_key for metric_key, _ in _conala_metric_table]
real_name_metrics = [display_name for _, display_name in _conala_metric_table]
def compute(data,metric,level="example",models=None):
    """Correlate human grades with an automatic metric's scores.

    Args:
        data: iterable of per-example dicts. Each dict must contain
            ``grade-<model>`` and ``<metric>-<model>`` entries for every
            model in ``models``.
        metric: metric key prefix used to look up scores, e.g. ``"bleu"``.
        level: ``"example"`` computes one correlation per example (across
            the models) and averages them with ``nanmean``, so examples whose
            correlation is NaN are skipped. Any other value pools every
            (grade, score) pair into one sequence and computes a single
            correlation over it.
        models: model names to include. Defaults to the module-level
            ``conala_models_list``.

    Returns:
        Tuple ``(kendall_tau, pearson_r, spearman_rho)``.
    """
    # Resolve the default lazily so callers can evaluate any subset of systems
    # without touching the module-level list.
    models = conala_models_list if models is None else list(models)

    refs,preds=[],[]
    for d in data:
        refs.append([d[f"grade-{k}"] for k in models])
        preds.append([d[f"{metric}-{k}"] for k in models])

    correlations = (stats.kendalltau, stats.pearsonr, stats.spearmanr)
    if level=="example":
        return tuple(
            nanmean([corr(ref,pred).statistic for ref,pred in zip(refs,preds)])
            for corr in correlations
        )

    # Flatten once and reuse for all three coefficients.
    flat_refs = l_extend(refs)
    flat_preds = l_extend(preds)
    return tuple(corr(flat_refs,flat_preds).statistic for corr in correlations)

In [3]:
# Example Level
def _fmt_corr(value):
    """Format a correlation with 3 decimals, dropping only a leading zero.

    Keeps the sign and the integer part when it is not zero
    (".439", "-.120", "1.000", "nan"); slicing off the first character
    unconditionally would turn "-0.120" into "0.120".
    """
    # "+ 0.0" normalises a rounded -0.0 to 0.0 so it prints as ".000".
    text = "{:.3f}".format(round(value,3) + 0.0)
    if text.startswith("0."):
        return text[1:]
    if text.startswith("-0."):
        return "-" + text[2:]
    return text

# The grade file does not depend on the metric, so read it once up front.
with open("data/conala/conala_grade.json") as f:
    data = json.load(f)

for m,rm in zip(exp_metrics,real_name_metrics):
    print(rm)
    kendall,pearson,spearman = compute(data,m,level="example")
    print("\t\t",
          _fmt_corr(kendall),
          _fmt_corr(pearson),
          _fmt_corr(spearman))

BLEU
		 .439 .522 .488
CodeBLEU




		 .292 .363 .331
chrF
		 .458 .570 .515
ROUGE-L
		 .447 .529 .499
METEOR
		 .410 .507 .462
RUBY
		 .331 .397 .371
CodeBERTSCORE-F1 (w/o S.)
		 .499 .595 .558
CodeBERTSCORE-F1 (w/ S.)
		 .500 .609 .556
CodeBERTSCORE-F3 (w/o S.)
		 .485 .587 .542
CodeBERTSCORE-F3 (w/ S.)
		 .505 .609 .563
GPT-3.5 (w/o R.)
		 .556 .613 .594
GPT-3.5 (w/ R.)
		 .554 .617 .591
GPT-3.5 (w/o R.) + 0-shot-CoT
		 .561 .628 .600
GPT-3.5 (w/ R.) + 0-shot-CoT
		 .571 .639 .607


In [4]:
# Corpus Level
def _fmt_corr(value):
    """Format a correlation with 3 decimals, dropping only a leading zero.

    Keeps the sign and the integer part when it is not zero
    (".439", "-.120", "1.000", "nan"); slicing off the first character
    unconditionally would turn "-0.120" into "0.120".
    """
    # "+ 0.0" normalises a rounded -0.0 to 0.0 so it prints as ".000".
    text = "{:.3f}".format(round(value,3) + 0.0)
    if text.startswith("0."):
        return text[1:]
    if text.startswith("-0."):
        return "-" + text[2:]
    return text

# The grade file does not depend on the metric, so read it once up front.
with open("data/conala/conala_grade.json") as f:
    data = json.load(f)

for m,rm in zip(exp_metrics,real_name_metrics):
    print(rm)
    kendall,pearson,spearman = compute(data,m,level="corpus")
    print("\t\t",
          _fmt_corr(kendall),
          _fmt_corr(pearson),
          _fmt_corr(spearman))

BLEU
		 .423 .572 .542
CodeBLEU
		 .259 .397 .339
chrF
		 .449 .592 .578
ROUGE-L
		 .432 .581 .552
METEOR
		 .415 .557 .534
RUBY
		 .339 .493 .439
CodeBERTSCORE-F1 (w/o S.)
		 .460 .579 .589
CodeBERTSCORE-F1 (w/ S.)
		 .464 .579 .595
CodeBERTSCORE-F3 (w/o S.)
		 .441 .556 .568
CodeBERTSCORE-F3 (w/ S.)
		 .437 .549 .564
GPT-3.5 (w/o R.)
		 .546 .649 .635
GPT-3.5 (w/ R.)
		 .539 .661 .630
GPT-3.5 (w/o R.) + 0-shot-CoT
		 .579 .703 .665
GPT-3.5 (w/ R.) + 0-shot-CoT
		 .583 .712 .667


# HumanEval-X

In [5]:
baseline_metrics = [
    "bleu",
    "codebleu",
    "chrf",
    "rougel",
    "meteor",
    "ruby",
    "codebertscore",
    "gpt35",
]

# (metric key in the data files, display name used when printing), kept as
# pairs so the two derived lists below can never drift out of alignment.
_humaneval_metric_table = [
    ("bleu", "BLEU"),
    ("codebleu", "CodeBLEU"),
    ("chrf", "chrF"),
    ("rougel", "ROUGE-L"),
    ("meteor", "METEOR"),
    ("ruby", "RUBY"),
    ("codebertscore_f1", "CodeBERTSCORE-F1 (w/o S.)"),
    ("codebertscore_s_f1", "CodeBERTSCORE-F1 (w/ S.)"),
    ("codebertscore_f3", "CodeBERTSCORE-F3 (w/o S.)"),
    ("codebertscore_s_f3", "CodeBERTSCORE-F3 (w/ S.)"),
    ("gpt35_nsnr", "GPT-3.5 (w/o R.)"),
    ("gpt35_nswr", "GPT-3.5 (w/ R.)"),
]
exp_metrics = [metric_key for metric_key, _ in _humaneval_metric_table]
real_name_metrics = [display_name for _, display_name in _humaneval_metric_table]
def compute(data,metric,level="example"):
    """Correlate execution grades with an automatic metric's scores.

    For every example dict, the candidates are discovered from its
    ``grade-<name>`` keys; the reference value is that entry's
    ``"execution"`` field and the prediction is ``<metric>-<name>``.

    With ``level="example"`` a correlation is computed per example and the
    results are averaged with ``nanmean`` (NaN results are skipped). With any
    other ``level`` all pairs are pooled and correlated once.

    Returns a tuple ``(kendall_tau, pearson_r, spearman_rho)``.
    """
    grade_prefix = "grade-"
    graded, scored = [], []
    for example in data:
        names = [key.replace(grade_prefix, "")
                 for key in example.keys()
                 if key.startswith(grade_prefix)]
        graded.append([example[f"grade-{name}"]["execution"] for name in names])
        scored.append([example[f"{metric}-{name}"] for name in names])

    # Same order as the returned tuple: Kendall, Pearson, Spearman.
    correlations = (stats.kendalltau, stats.pearsonr, stats.spearmanr)

    if level != "example":
        pooled_grades = l_extend(graded)
        pooled_scores = l_extend(scored)
        return tuple(corr(pooled_grades, pooled_scores).statistic
                     for corr in correlations)

    per_example_means = []
    for corr in correlations:
        values = [corr(g, s).statistic for g, s in zip(graded, scored)]
        per_example_means.append(nanmean(values))
    return tuple(per_example_means)

In [6]:
# Example Level
def _fmt_corr(value):
    """Format a correlation with 3 decimals, dropping only a leading zero.

    Keeps the sign and the integer part when it is not zero
    (".337", "-.120", "1.000", "nan"); slicing off the first character
    unconditionally would turn "-0.120" into "0.120".
    """
    # "+ 0.0" normalises a rounded -0.0 to 0.0 so it prints as ".000".
    text = "{:.3f}".format(round(value,3) + 0.0)
    if text.startswith("0."):
        return text[1:]
    if text.startswith("-0."):
        return "-" + text[2:]
    return text

# Read each language's grade file once instead of once per metric.
grades_by_lang = {}
for lang in ["java","cpp","python","js"]:
    with open(f"data/humaneval/humaneval_{lang}_grade.json") as f:
        grades_by_lang[lang] = json.load(f)

for m,rm in zip(exp_metrics,real_name_metrics):
    print(rm)
    kendalls,spearmans=[],[]
    for lang,data in grades_by_lang.items():
        # compute() also returns Pearson, which this table does not report.
        kendall,_pearson,spearman = compute(data,m,level="example")
        kendalls.append(kendall)
        spearmans.append(spearman)
        print("\t",lang)
        print("\t\t",
              _fmt_corr(kendall),
              _fmt_corr(spearman))
    print("\t","average")
    print("\t\t",
          _fmt_corr(mean(kendalls)),
          _fmt_corr(mean(spearmans)))

BLEU
	 java
		 .337 .401
	 cpp
		 .146 .174
	 python
		 .251 .297
	 js
		 .168 .199
	 average
		 .225 .268
CodeBLEU
	 java
		 .355 .421
	 cpp
		 .157 .187
	 python
		 .272 .323
	 js
		 .226 .267
	 average
		 .253 .299
chrF
	 java
		 .346 .413
	 cpp
		 .166 .198
	 python
		 .262 .312
	 js
		 .186 .220
	 average
		 .240 .286
ROUGE-L
	 java
		 .327 .389
	 cpp
		 .143 .171
	 python
		 .240 .284
	 js
		 .151 .179
	 average
		 .215 .256
METEOR
	 java
		 .358 .425
	 cpp
		 .174 .208
	 python
		 .276 .327
	 js
		 .195 .231
	 average
		 .251 .298
RUBY
	 java
		 .340 .401
	 cpp
		 .139 .165
	 python
		 .216 .255
	 js
		 .138 .163
	 average
		 .208 .246
CodeBERTSCORE-F1 (w/o S.)
	 java
		 .333 .398
	 cpp
		 .146 .175
	 python
		 .237 .283
	 js
		 .148 .176
	 average
		 .216 .258
CodeBERTSCORE-F1 (w/ S.)
	 java
		 .314 .375
	 cpp
		 .148 .177
	 python
		 .231 .276
	 js
		 .145 .172
	 average
		 .209 .250
CodeBERTSCORE-F3 (w/o S.)
	 java
		 .359 .429
	 cpp
		 .169 .202
	 python
		 .265 .316
	 js
		

In [7]:
# Corpus Level
def _fmt_corr(value):
    """Format a correlation with 3 decimals, dropping only a leading zero.

    Keeps the sign and the integer part when it is not zero
    (".267", "-.120", "1.000", "nan"); slicing off the first character
    unconditionally would turn "-0.120" into "0.120".
    """
    # "+ 0.0" normalises a rounded -0.0 to 0.0 so it prints as ".000".
    text = "{:.3f}".format(round(value,3) + 0.0)
    if text.startswith("0."):
        return text[1:]
    if text.startswith("-0."):
        return "-" + text[2:]
    return text

# Read each language's grade file once instead of once per metric.
grades_by_lang = {}
for lang in ["java","cpp","python","js"]:
    with open(f"data/humaneval/humaneval_{lang}_grade.json") as f:
        grades_by_lang[lang] = json.load(f)

for m,rm in zip(exp_metrics,real_name_metrics):
    print(rm)
    kendalls,spearmans=[],[]
    for lang,data in grades_by_lang.items():
        # compute() also returns Pearson, which this table does not report.
        kendall,_pearson,spearman = compute(data,m,level="corpus")
        kendalls.append(kendall)
        spearmans.append(spearman)
        print("\t",lang)
        print("\t\t",
              _fmt_corr(kendall),
              _fmt_corr(spearman))
    print("\t","average")
    print("\t\t",
          _fmt_corr(mean(kendalls)),
          _fmt_corr(mean(spearmans)))

BLEU
	 java
		 .267 .326
	 cpp
		 .225 .276
	 python
		 .281 .344
	 js
		 .220 .270
	 average
		 .248 .304
CodeBLEU
	 java
		 .293 .359
	 cpp
		 .212 .260
	 python
		 .303 .371
	 js
		 .315 .385
	 average
		 .281 .343
chrF
	 java
		 .290 .355
	 cpp
		 .266 .325
	 python
		 .328 .402
	 js
		 .279 .342
	 average
		 .291 .356
ROUGE-L
	 java
		 .280 .342
	 cpp
		 .234 .286
	 python
		 .296 .363
	 js
		 .216 .264
	 average
		 .256 .314
METEOR
	 java
		 .318 .389
	 cpp
		 .260 .319
	 python
		 .349 .427
	 js
		 .311 .380
	 average
		 .309 .379
RUBY
	 java
		 .276 .337
	 cpp
		 .219 .268
	 python
		 .279 .341
	 js
		 .219 .268
	 average
		 .248 .303
CodeBERTSCORE-F1 (w/o S.)
	 java
		 .299 .367
	 cpp
		 .266 .326
	 python
		 .322 .394
	 js
		 .248 .303
	 average
		 .284 .348
CodeBERTSCORE-F1 (w/ S.)
	 java
		 .244 .298
	 cpp
		 .219 .268
	 python
		 .264 .324
	 js
		 .214 .262
	 average
		 .235 .288
CodeBERTSCORE-F3 (w/o S.)
	 java
		 .326 .399
	 cpp
		 .283 .347
	 python
		 .360 .441
	 js
		