In [5]:
import yaml
from matplotlib import pyplot as plt

def _select_best_response(case_report, fields):
    """Pick the highest-scoring response of one case.

    A response's score is the sum of its ``<field>_score`` entries over the
    fields it actually reports. Returns ``(full, got)``: two dicts keyed by
    field name holding that response's ``<field>_totscore`` and
    ``<field>_score`` values. Both dicts are empty when the case has no
    responses or none reaches a summed score of at least 0.
    """
    best_score = 0.
    best_full = {}
    best_got = {}
    # A ``detail:`` key with no entries loads as None; treat it as "no responses".
    for response_report in case_report['detail'] or []:
        scored = [field for field in fields if field + '_score' in response_report]
        got = {field: response_report[field + '_score'] for field in scored}
        full = {field: response_report[field + '_totscore'] for field in scored}
        now_score = sum(got.values(), 0.)
        # ">=" means that among equally scored responses the last one wins.
        if now_score >= best_score:
            best_score, best_full, best_got = now_score, full, got
    return best_full, best_got


def _rescale_to_case_weight(full, got, case_full_score):
    """Scale per-field scores so the per-field full scores sum to ``case_full_score``.

    When the full scores sum to 0 there is nothing to scale by, so the values
    are returned unchanged instead of dividing by zero.
    """
    total = sum(full.values())
    if not total:
        return dict(full), dict(got)
    scale = case_full_score / total
    return (
        {field: value * scale for field, value in full.items()},
        {field: value * scale for field, value in got.items()},
    )


def _accumulate(get_by_key, full_by_key, freq_by_key, key, got, full):
    """Add one observation to the three parallel accumulators under ``key``."""
    get_by_key[key] = get_by_key.get(key, 0.) + got
    full_by_key[key] = full_by_key.get(key, 0.) + full
    freq_by_key[key] = freq_by_key.get(key, 0.) + 1.


def compute_stats(file_path: str):
    """Aggregate an evaluation report stored as YAML into summary statistics.

    The file is expected to hold a mapping from case name to a case report.
    Each case report is read through the keys ``full_score``, ``now_score``,
    ``lang``, ``type`` and ``detail``; ``detail`` is a list of response reports
    carrying ``<field>_score`` / ``<field>_totscore`` pairs for any of the
    fields listed in ``fields`` below.

    Args:
        file_path: Path of the YAML report to read.

    Returns:
        A dict with:
          * ``tot_score`` / ``tot_full_score``: sums of ``now_score`` and
            ``full_score`` over all cases.
          * ``lang`` / ``area``: distinct ``lang`` / ``type`` values in order of
            first appearance.
          * ``field``: the list of score fields considered.
          * ``perlang_*`` / ``perarea_*``: obtained score, full score and case
            count grouped by ``lang`` / ``type``.
          * ``peritem_*``: obtained score, full score and case count per field,
            taken from each case's best response and rescaled so the per-field
            full scores of a case sum to that case's ``full_score``.
        An empty document yields zero totals and empty groupings.

    Raises:
        KeyError: If a case report lacks one of the keys named above, or a
            response has ``<field>_score`` without ``<field>_totscore``.
    """
    with open(file_path, 'r') as f:
        # NOTE(security): yaml.Loader can construct arbitrary Python objects, so
        # this must only be pointed at trusted report files. If the reports hold
        # plain data only, yaml.safe_load would be the safer choice.
        full_results = yaml.load(f, yaml.Loader)
    # An empty YAML document loads as None rather than an empty mapping.
    full_results = full_results or {}

    fields = ['keywords', 'blank_filling', 'unit_test', 'similarity', 'customized']

    peritem_get_score, peritem_full_score, peritem_freq = {}, {}, {}
    perlang_get_score, perlang_full_score, perlang_freq = {}, {}, {}
    perarea_get_score, perarea_full_score, perarea_freq = {}, {}, {}
    tot_score = 0.
    full_score = 0.

    for case_report in full_results.values():
        case_full_score = case_report['full_score']

        # Per-field statistics come from the best response of the case only.
        best_full, best_got = _select_best_response(case_report, fields)
        best_full, best_got = _rescale_to_case_weight(best_full, best_got, case_full_score)
        for item in best_full:
            _accumulate(peritem_get_score, peritem_full_score, peritem_freq,
                        item, best_got[item], best_full[item])

        # Case-level statistics use the score recorded on the case itself.
        now_score = case_report['now_score']
        tot_score += now_score
        full_score += case_full_score

        _accumulate(perlang_get_score, perlang_full_score, perlang_freq,
                    case_report['lang'], now_score, case_full_score)
        _accumulate(perarea_get_score, perarea_full_score, perarea_freq,
                    case_report['type'], now_score, case_full_score)

    return {
        'tot_score': tot_score,
        'tot_full_score': full_score,
        'lang': list(perlang_get_score.keys()),
        'area': list(perarea_get_score.keys()),
        'field': fields,
        'perarea_freq': perarea_freq,
        'perarea_get_score': perarea_get_score,
        'perarea_full_score': perarea_full_score,
        'perlang_freq': perlang_freq,
        'perlang_get_score': perlang_get_score,
        'perlang_full_score': perlang_full_score,
        'peritem_freq': peritem_freq,
        'peritem_get_score': peritem_get_score,
        'peritem_full_score': peritem_full_score
    }


In [7]:
# Aggregate the stored report for this run; the bare call leaves the returned
# statistics dict as the cell's displayed value.
# NOTE(review): the "gpt-4_0.2_0.9_10" part of the file name presumably encodes
# the model and its sampling settings — confirm against whatever writes these files.
compute_stats('results/suite_v1_gpt-4_0.2_0.9_10.yaml')

{'tot_score': 28.792113504566615,
 'tot_full_score': 40.0,
 'lang': ['web',
  'python',
  'javascript',
  'dart',
  'sql',
  'swift',
  'go',
  'java',
  'mobile',
  'git',
  'system',
  'php'],
 'area': ['code debugging', 'code completion', 'knowledge question-answering'],
 'field': ['keywords',
  'blank_filling',
  'unit_test',
  'similarity',
  'customized'],
 'perarea_freq': {'code debugging': 19.0,
  'code completion': 8.0,
  'knowledge question-answering': 13.0},
 'perarea_get_score': {'code debugging': 11.119047619047619,
  'code completion': 7.5,
  'knowledge question-answering': 10.173065885519},
 'perarea_full_score': {'code debugging': 19.0,
  'code completion': 8.0,
  'knowledge question-answering': 13.0},
 'perlang_freq': {'web': 6.0,
  'python': 9.0,
  'javascript': 11.0,
  'dart': 3.0,
  'sql': 1.0,
  'swift': 1.0,
  'go': 1.0,
  'java': 2.0,
  'mobile': 3.0,
  'git': 1.0,
  'system': 1.0,
  'php': 1.0},
 'perlang_get_score': {'web': 4.5,
  'python': 7.6,
  'javascript':

In [8]:
# Same aggregation for a second report file, so the two displayed dicts can be
# compared key by key.
# NOTE(review): the "gpt-3.5-turbo_0.2_0.9_10" part of the file name presumably
# encodes the model and its sampling settings — confirm.
compute_stats('results/suite_v1_gpt-3.5-turbo_0.2_0.9_10.yaml')

{'tot_score': 22.905522818566297,
 'tot_full_score': 40.0,
 'lang': ['web',
  'python',
  'javascript',
  'dart',
  'sql',
  'swift',
  'go',
  'java',
  'mobile',
  'git',
  'system',
  'php'],
 'area': ['code debugging', 'code completion', 'knowledge question-answering'],
 'field': ['keywords',
  'blank_filling',
  'unit_test',
  'similarity',
  'customized'],
 'perarea_freq': {'code debugging': 19.0,
  'code completion': 8.0,
  'knowledge question-answering': 13.0},
 'perarea_get_score': {'code debugging': 9.583333333333334,
  'code completion': 6.5,
  'knowledge question-answering': 6.822189485232963},
 'perarea_full_score': {'code debugging': 19.0,
  'code completion': 8.0,
  'knowledge question-answering': 13.0},
 'perlang_freq': {'web': 6.0,
  'python': 9.0,
  'javascript': 11.0,
  'dart': 3.0,
  'sql': 1.0,
  'swift': 1.0,
  'go': 1.0,
  'java': 2.0,
  'mobile': 3.0,
  'git': 1.0,
  'system': 1.0,
  'php': 1.0},
 'perlang_get_score': {'web': 2.928571428571429,
  'python': 6.5,
