In [57]:
import yaml
from matplotlib import pyplot as plt

def _select_best_response(responses, fields):
    """Return the per-metric (full, got) score dicts of the best response.

    A response's total is the sum of its ``<field>_score`` entries over
    ``fields``. Ties go to the later response (``>=``), and a response is
    only selected when its total is at least 0. Two empty dicts are
    returned when nothing is selected.
    """
    best_total = 0.
    best_full, best_got = {}, {}
    for report in responses:
        total = 0.
        full, got = {}, {}
        for field in fields:
            score_key = field + '_score'
            if score_key in report:
                total += report[score_key]
                full[field] = report[field + '_totscore']
                got[field] = report[score_key]
        if total >= best_total:
            best_total, best_full, best_got = total, full, got
    return best_full, best_got


def _accumulate(get_totals, full_totals, freqs, key, got, full):
    """Add one observation for ``key`` to the three parallel accumulators."""
    get_totals[key] = get_totals.get(key, 0.) + got
    full_totals[key] = full_totals.get(key, 0.) + full
    freqs[key] = freqs.get(key, 0.) + 1.


def compute_stats(file_path: str):
    """Aggregate a YAML result file into overall and grouped score totals.

    The file must hold a mapping of case name -> case report. Each case
    report is read for ``full_score``, ``now_score``, ``lang``, ``type`` and
    ``detail`` (an iterable of per-response dicts carrying
    ``<field>_score`` / ``<field>_totscore`` pairs).

    Per-metric totals come from the best-scoring response of each case,
    rescaled so that the metric full scores of that response add up to the
    case's ``full_score``. A case whose metric full scores add up to 0
    cannot be rescaled and contributes 0 to the per-metric totals instead of
    raising ``ZeroDivisionError``.

    Args:
        file_path: Path of the YAML result file.

    Returns:
        A dict with ``tot_score``, ``tot_full_score``, the lists ``lang``,
        ``area`` and ``field``, and for each of the ``perlang`` /
        ``perarea`` / ``peritem`` groupings a ``*_freq``, ``*_get_score``
        and ``*_full_score`` dict keyed by group name.
    """
    # NOTE(review): yaml.Loader can construct arbitrary Python objects, so
    # this must only be pointed at trusted result files. If the reports only
    # contain plain YAML types, yaml.safe_load would be the safer choice --
    # TODO confirm before switching.
    with open(file_path, 'r') as f:
        full_results = yaml.load(f, yaml.Loader)

    fields = ['keywords', 'blank_filling', 'unit_test', 'similarity', 'customized']

    peritem_full_score, peritem_get_score, peritem_freq = {}, {}, {}
    perlang_full_score, perlang_get_score, perlang_freq = {}, {}, {}
    perarea_full_score, perarea_get_score, perarea_freq = {}, {}, {}
    tot_score = 0.
    full_score = 0.

    for case_report in full_results.values():
        case_full_score = case_report['full_score']

        item_full, item_got = _select_best_response(case_report['detail'], fields)

        # Rescale the metric breakdown so it sums to the case's full score.
        # A zero denominator means there is nothing to distribute.
        tot_case_score = sum(item_full.values())
        scale = case_full_score / tot_case_score if tot_case_score else 0.
        for item in item_full:
            _accumulate(
                peritem_get_score, peritem_full_score, peritem_freq, item,
                item_got[item] * scale, item_full[item] * scale,
            )

        now_score = case_report['now_score']
        tot_score += now_score
        full_score += case_full_score

        _accumulate(
            perlang_get_score, perlang_full_score, perlang_freq,
            case_report['lang'], now_score, case_full_score,
        )
        _accumulate(
            perarea_get_score, perarea_full_score, perarea_freq,
            case_report['type'], now_score, case_full_score,
        )

    return {
        'tot_score': tot_score,
        'tot_full_score': full_score,
        'lang': list(perlang_get_score),
        'area': list(perarea_get_score),
        'field': fields,
        'perarea_freq': perarea_freq,
        'perarea_get_score': perarea_get_score,
        'perarea_full_score': perarea_full_score,
        'perlang_freq': perlang_freq,
        'perlang_get_score': perlang_get_score,
        'perlang_full_score': perlang_full_score,
        'peritem_freq': peritem_freq,
        'peritem_get_score': peritem_get_score,
        'peritem_full_score': peritem_full_score
    }


In [64]:
# Aggregate the gpt-4 result file into summary statistics.
gpt4_stats = compute_stats('results/suite_v1_avg_gpt-4_0.2_0.9_10.yaml')

In [65]:
# Aggregate the gpt-3.5-turbo result file into summary statistics.
gpt35_stats = compute_stats('results/suite_v1_avg_gpt-3.5-turbo_0.2_0.9_10.yaml')

In [66]:
def _section_rows(stats, label, keys, get_field, full_field):
    """Build one table section (per language, per type or per metric).

    Rows are ordered by the full score recorded in ``stats[0]``, largest
    first. Every ratio uses ``stats[0]``'s full scores as denominator.
    """
    reference = stats[0]
    full_scores = reference[full_field]
    grand_full = reference['tot_full_score']
    rows = []
    for key in sorted(keys, key=lambda k: full_scores[k], reverse=True):
        full = full_scores[key]
        row = [label + key]
        for stat in stats:
            # A run that never produced this key simply scored nothing on it.
            got = stat[get_field].get(key, 0.)
            row += [got, got / full]
        row += [full, full / grand_full]
        rows.append(row)
    return rows


def tab_gen(stats, cols):
    """Build a comparison table (list of rows) from several stats dicts.

    Args:
        stats: Non-empty sequence of dicts as returned by ``compute_stats``.
            ``stats[0]`` is the reference: it supplies the row set, the row
            ordering and every full score used as a denominator.
        cols: One column title per entry of ``stats``.

    Returns:
        A list of rows. The first row is the header; each later row is
        ``[label, score_0, ratio_0, score_1, ratio_1, ..., full, allocation]``
        where ``allocation`` is the row's share of the total full score (an
        empty string on the overall row). Languages, types or metrics that
        are absent from a non-reference stats dict count as a score of 0
        instead of raising ``KeyError``.
    """
    reference = stats[0]
    grand_full = reference['tot_full_score']

    header = ['']
    for col in cols:
        header += [col, '']
    header += ['Full Score', 'Allocation']

    overall = ['Overall Score']
    for stat in stats:
        overall += [stat['tot_score'], stat['tot_score'] / grand_full]
    overall += [grand_full, '']

    lines = [header, overall]
    lines += _section_rows(
        stats, 'Lang: ', reference['lang'],
        'perlang_get_score', 'perlang_full_score')
    lines += _section_rows(
        stats, 'Type: ', reference['area'],
        'perarea_get_score', 'perarea_full_score')
    lines += _section_rows(
        stats, 'Metric: ', reference['peritem_full_score'].keys(),
        'peritem_get_score', 'peritem_full_score')
    return lines

In [67]:
# Build the comparison table; the first stats entry (gpt4) supplies the
# full scores and the row ordering.
tab = tab_gen([gpt4_stats, gpt35_stats], ['gpt4', 'gpt3.5'])

In [68]:
def to_csv(tab, path):
    """Render a table as comma-separated text, write it to ``path``, return it.

    Formatting depends on the cell type and its column index ``j``:

    * ``str`` cells are emitted verbatim (no quoting or escaping);
    * ``float`` cells in odd columns are scores, printed with two decimals;
    * ``float`` cells in even columns are ratios, printed as a percentage
      with two decimals;
    * any other cell (e.g. an ``int``) is emitted via ``str()``.

    Args:
        tab: Iterable of rows, each an iterable of cells.
        path: Destination file path; the file is overwritten.

    Returns:
        The full CSV text, one newline-terminated line per row.
    """
    def render(j, item):
        if isinstance(item, str):
            return item
        if isinstance(item, float):
            if j % 2 == 1:
                return f'{item:.2f}'
            return f'{item * 100.:.2f}%'
        # Non-str, non-float cells cannot be concatenated to a str directly.
        return str(item)

    # Build the whole text first so a formatting error cannot leave a
    # truncated file behind.
    s = ''.join(
        ','.join(render(j, item) for j, item in enumerate(row)) + '\n'
        for row in tab
    )
    with open(path, 'w') as f:
        f.write(s)
    return s

In [69]:
# Write the table to result_avg.csv and print the same CSV text.
print(to_csv(tab, 'result_avg.csv'))

,gpt4,,gpt3.5,,Full Score,Allocation
Overall Score,21.74,54.36%,17.64,44.10%,40.00,
Lang: javascript,6.07,55.15%,4.13,37.58%,11.00,27.50%
Lang: python,6.02,66.89%,5.18,57.59%,9.00,22.50%
Lang: web,3.28,54.64%,2.78,46.31%,6.00,15.00%
Lang: dart,1.95,65.00%,2.12,70.56%,3.00,7.50%
Lang: mobile,0.70,23.33%,0.00,0.00%,3.00,7.50%
Lang: java,0.57,28.34%,0.53,26.25%,2.00,5.00%
Lang: sql,0.29,28.57%,0.36,35.71%,1.00,2.50%
Lang: swift,0.41,40.61%,0.21,20.84%,1.00,2.50%
Lang: go,0.35,35.00%,0.47,47.50%,1.00,2.50%
Lang: git,0.27,26.85%,0.26,26.25%,1.00,2.50%
Lang: system,1.00,100.00%,1.00,100.00%,1.00,2.50%
Lang: php,0.85,85.00%,0.60,60.00%,1.00,2.50%
Type: code debugging,8.09,42.56%,7.17,37.76%,19.00,47.50%
Type: knowledge question-answering,7.36,56.59%,5.77,44.41%,13.00,32.50%
Type: code completion,6.30,78.75%,4.69,58.67%,8.00,20.00%
Metric: keywords,20.24,72.01%,15.52,55.22%,28.10,70.25%
Metric: unit_test,5.90,100.00%,5.90,100.00%,5.90,14.75%
Metric: similarity,1.07,26.83%,0.81,20.26%,4.00,10.0