In [1]:
import pandas as pd
import numpy as np
import os
import json
import re
import ast
import copy

# Load Data

In [2]:
# Dataset: names of the '.py' entries found directly inside each dataset folder.
humaneval_data_files = [
    name for name in os.listdir('humaneval_data/')
    if name.endswith('.py')
]
mbpp_data_files = [
    name for name in os.listdir('mbpp_data/')
    if name.endswith('.py')
]

In [3]:
# Error log of the coverage runs, kept as a list of raw lines (newlines retained).
# Read inside a `with` block so the file handle is closed as soon as the
# lines are loaded instead of lingering until garbage collection.
with open("coverage_note.txt") as note_file:
    coverage_error_note = note_file.readlines()

In [4]:
# Pynguin-generated test files. Removing 'test_' from a test file name gives
# the name of the dataset file it targets (check_generate relies on this).
humaneval_test_3_files = [
    name for name in os.listdir('humaneval_test_3/')
    if 'test_problem_id_' in name
]
humaneval_test_5_files = [
    name for name in os.listdir('humaneval_test_5/')
    if 'test_problem_id_' in name
]
humaneval_test_perfect_files = [
    name for name in os.listdir('humaneval_test_perfect/')
    if 'test_problem_id_' in name and name.endswith('.py')
]

mbpp_test_3_files = [
    name for name in os.listdir('mbpp_test_3/')
    if 'test_problem_id_' in name
]
mbpp_test_5_files = [
    name for name in os.listdir('mbpp_test_5/')
    if 'test_problem_id_' in name
]
mbpp_test_perfect_files = [
    name for name in os.listdir('mbpp_test_perfect/')
    if 'test_problem_id_' in name and name.endswith('.py')
]

# Directory-name suffixes of the three test settings, in the order used by
# every per-setting list in this notebook.
test_path = ['3', '5','perfect']

In [5]:
# coverage report
def _load_json(path):
    """Parse the JSON file at `path` and close the file handle afterwards."""
    with open(path) as report_file:
        return json.load(report_file)


coverage_humaneval_3 = _load_json('coverage_humaneval_3.json')
coverage_humaneval_5 = _load_json('coverage_humaneval_5.json')
coverage_humaneval_perfect = _load_json('coverage_humaneval_perfect.json')

coverage_mbpp_3 = _load_json('coverage_mbpp_3.json')
coverage_mbpp_5 = _load_json('coverage_mbpp_5.json')
coverage_mbpp_perfect = _load_json('coverage_mbpp_perfect.json')

In [6]:
# mutation report
def _read_lines(path):
    """Return the lines of the text file at `path` (newlines kept), closing the file."""
    with open(path) as report_file:
        return report_file.readlines()


mutation_humaneval_3 = _read_lines('humaneval_test_3/mutation_humaneval_3.txt')
mutation_humaneval_5 = _read_lines('humaneval_test_5/mutation_humaneval_5.txt')
mutation_humaneval_perfect = _read_lines('humaneval_test_perfect/mutation_humaneval_perfect.txt')

mutation_mbpp_3 = _read_lines('mbpp_test_3/mutation_mbpp_3.txt')
mutation_mbpp_5 = _read_lines('mbpp_test_5/mutation_mbpp_5.txt')
mutation_mbpp_perfect = _read_lines('mbpp_test_perfect/mutation_mbpp_perfect.txt')

In [7]:
# Per-dataset collections, each ordered as: 3 tests, 5 tests, perfect.
# The position in these lists lines up with `test_type` below.
humaneval_test_list = [
    humaneval_test_3_files,
    humaneval_test_5_files,
    humaneval_test_perfect_files,
]
mbpp_test_list = [
    mbpp_test_3_files,
    mbpp_test_5_files,
    mbpp_test_perfect_files,
]

humaneval_coverage_list = [
    coverage_humaneval_3,
    coverage_humaneval_5,
    coverage_humaneval_perfect,
]
mbpp_coverage_list = [
    coverage_mbpp_3,
    coverage_mbpp_5,
    coverage_mbpp_perfect,
]

humaneval_mutation_list = [
    mutation_humaneval_3,
    mutation_humaneval_5,
    mutation_humaneval_perfect,
]
mbpp_mutation_list = [
    mutation_mbpp_3,
    mutation_mbpp_5,
    mutation_mbpp_perfect,
]

# Display labels printed in front of each result row.
test_type = ['3-test', '5-test','multi perfect']

# Generate

In [8]:
def check_generate(data_files, test_files):
    """Return the percentage of dataset files that have a generated test file.

    A test file is matched to a dataset file by removing 'test_' from its
    name. The result is rounded to two decimals and is relative to
    ``len(data_files)``.
    """
    covered_names = {name.replace('test_', '') for name in test_files}
    n_matched = len(covered_names & set(data_files))
    return round(n_matched / len(data_files) * 100, 2)

In [9]:
# Generation rate per test setting, with the raw "test files / dataset files" counts.
print('HumanEval Dataset')
for label, test_files in zip(test_type, humaneval_test_list):
    generated_pct = check_generate(humaneval_data_files, test_files)
    print(label, generated_pct, f'({len(test_files)}/{len(humaneval_data_files)})')
print()
print('MBPP Dataset')
for label, test_files in zip(test_type, mbpp_test_list):
    generated_pct = check_generate(mbpp_data_files, test_files)
    print(label, generated_pct, f'({len(test_files)}/{len(mbpp_data_files)})')

HumanEval Dataset
3-test 87.8 (144/164)
5-test 85.37 (140/164)
multi perfect 86.59 (142/164)

MBPP Dataset
3-test 91.44 (235/257)
5-test 90.66 (233/257)
multi perfect 89.49 (230/257)


# Compilable

In [10]:
def check_compilable(test_files, rootpath, data_files):
    n_parsed = 0
    for file in test_files:
        temp_text = open(rootpath + file).read()
        if temp_text == '':
            continue
        try:
            ast.parse(temp_text)
            n_parsed += 1
        except e:
            pass
    return round(n_parsed/len(data_files)*100, 2), n_parsed

In [11]:
# Share of dataset problems whose generated test file parses, per test setting.
print('HumanEval Dataset')
for label, suffix, test_files in zip(test_type, test_path, humaneval_test_list):
    score, count = check_compilable(test_files, f'humaneval_test_{suffix}/', humaneval_data_files)
    print(label, score, f'(Pass AST: {count}/{len(humaneval_data_files)})')
print()
print('MBPP Dataset')
for label, suffix, test_files in zip(test_type, test_path, mbpp_test_list):
    score, count = check_compilable(test_files, f'mbpp_test_{suffix}/', mbpp_data_files)
    print(label, score, f'(Pass AST: {count}/{len(mbpp_data_files)})')

HumanEval Dataset
3-test 87.8 (Pass AST: 144/164)
5-test 85.37 (Pass AST: 140/164)
multi perfect 86.59 (Pass AST: 142/164)

MBPP Dataset
3-test 91.44 (Pass AST: 235/257)
5-test 90.66 (Pass AST: 233/257)
multi perfect 89.49 (Pass AST: 230/257)


# Passing Test Case

## Filter Out Coverage of Files with Failing Tests

In [12]:
coverage_error_dict = {}
k = ""
for line in coverage_error_note:
    if line.startswith('coverage run -m '):
        k = line.split()[-1]
        coverage_error_dict[k] = set()
    elif line.startswith('FAILED '):
        temp = line.split()[1]
        temp = temp.split("::")[0]
        coverage_error_dict[k].add(temp)
print('Files with error test case(s).')
for k in coverage_error_dict:
    coverage_error_dict[k] = list(coverage_error_dict[k])
    print(f"{k}\t:\t{len(coverage_error_dict[k])}")

Files with error test case(s).
humaneval_test_perfect	:	5
humaneval_test_5	:	2
humaneval_test_3	:	3
mbpp_test_perfect	:	25
mbpp_test_5	:	27
mbpp_test_3	:	29


In [13]:
# Inspect the test files with failing test cases recorded for 'humaneval_test_3'.
coverage_error_dict['humaneval_test_3']

['humaneval_test_3/test_problem_id_22.py',
 'humaneval_test_3/test_problem_id_16.py',
 'humaneval_test_3/test_problem_id_29.py']

In [14]:
def filter_coverage_error_files(coverage_error_list, coverage_report):
    """Return a copy of a coverage report without the files tied to failing tests.

    Parameters
    ----------
    coverage_error_list : iterable of str
        Paths of test files that had failing test cases, written the same
        way as the keys of ``coverage_report['files']``.
    coverage_report : dict
        Coverage report with a ``'files'`` mapping keyed by file path.

    Returns
    -------
    dict
        Deep copy of ``coverage_report`` in which, for every listed test
        file, both the test file entry and the entry of its source file
        (same path with '/test_' replaced by '/') are removed. The input
        report is left untouched.
    """
    filtered_coverage_report = copy.deepcopy(coverage_report)
    report_files = filtered_coverage_report['files']
    for filename in coverage_error_list:
        data_filename = filename.replace('/test_', '/')
        # pop(..., None) removes each entry independently: a missing source
        # entry no longer raises KeyError, and a source entry is still
        # dropped when the test file itself is absent from the report.
        report_files.pop(filename, None)
        report_files.pop(data_filename, None)
    return filtered_coverage_report

In [15]:
# One filtered report per dataset and test setting: entries belonging to
# test files with failing test cases are removed from a copy of each report.
filtered_coverage_humaneval_3 = filter_coverage_error_files(
    coverage_error_dict['humaneval_test_3'],
    coverage_humaneval_3,
)
filtered_coverage_humaneval_5 = filter_coverage_error_files(
    coverage_error_dict['humaneval_test_5'],
    coverage_humaneval_5,
)
filtered_coverage_humaneval_perfect = filter_coverage_error_files(
    coverage_error_dict['humaneval_test_perfect'],
    coverage_humaneval_perfect,
)

filtered_coverage_mbpp_3 = filter_coverage_error_files(
    coverage_error_dict['mbpp_test_3'],
    coverage_mbpp_3,
)
filtered_coverage_mbpp_5 = filter_coverage_error_files(
    coverage_error_dict['mbpp_test_5'],
    coverage_mbpp_5,
)
filtered_coverage_mbpp_perfect = filter_coverage_error_files(
    coverage_error_dict['mbpp_test_perfect'],
    coverage_mbpp_perfect,
)

In [16]:
# Filtered reports, ordered like the unfiltered lists: 3 tests, 5 tests, perfect.
humaneval_filtered_coverage_list = [
    filtered_coverage_humaneval_3,
    filtered_coverage_humaneval_5,
    filtered_coverage_humaneval_perfect,
]
mbpp_filtered_coverage_list = [
    filtered_coverage_mbpp_3,
    filtered_coverage_mbpp_5,
    filtered_coverage_mbpp_perfect,
]

## Filter Out Mutation Results of Failing Tests

In [17]:
# Mutation testing (MutPy) does not run when any test case fails, so no filtering is needed here.

# Coverage Score

In [18]:
def calculate_coverage(coverage_report, data_files):
    """Average the coverage of the source files in a coverage report.

    Only report entries whose path ends with '.py' and does not contain
    'test_problem_id_' are considered. The sum of their 'percent_covered'
    values is divided by ``len(data_files)``, not by the number of entries
    found, so dataset files missing from the report lower the average.

    Returns ``(average rounded to two decimals, number of entries used)``.
    """
    source_scores = [
        entry['summary']['percent_covered']
        for path, entry in coverage_report['files'].items()
        if path.endswith('.py') and 'test_problem_id_' not in path
    ]
    average = sum(source_scores) / len(data_files)
    return round(average, 2), len(source_scores)

In [19]:
# Coverage score per test setting on the unfiltered reports, plus how many
# source files the report contains relative to the dataset size.
print('HumanEval Dataset')
for label, report in zip(test_type, humaneval_coverage_list):
    score, count = calculate_coverage(report, humaneval_data_files)
    available_pct = round(count/len(humaneval_data_files)*100,2)
    print(label, score, f'(Available: {count}/{len(humaneval_data_files)}={available_pct})')
print()
print('MBPP Dataset')
for label, report in zip(test_type, mbpp_coverage_list):
    score, count = calculate_coverage(report, mbpp_data_files)
    available_pct = round(count/len(mbpp_data_files)*100,2)
    print(label, score, f'(Available: {count}/{len(mbpp_data_files)}={available_pct})')

HumanEval Dataset
3-test 87.04 (Available: 144/164=87.8)
5-test 84.65 (Available: 140/164=85.37)
multi perfect 85.91 (Available: 142/164=86.59)

MBPP Dataset
3-test 87.03 (Available: 235/257=91.44)
5-test 86.79 (Available: 233/257=90.66)
multi perfect 84.64 (Available: 230/257=89.49)


In [20]:
# Same coverage summary, computed on the reports with failing-test files removed.
print('HumanEval Dataset')
for label, report in zip(test_type, humaneval_filtered_coverage_list):
    score, count = calculate_coverage(report, humaneval_data_files)
    available_pct = round(count/len(humaneval_data_files)*100,2)
    print(label, score, f'(Available: {count}/{len(humaneval_data_files)}={available_pct})')
print()
print('MBPP Dataset')
for label, report in zip(test_type, mbpp_filtered_coverage_list):
    score, count = calculate_coverage(report, mbpp_data_files)
    available_pct = round(count/len(mbpp_data_files)*100,2)
    print(label, score, f'(Available: {count}/{len(mbpp_data_files)}={available_pct})')

HumanEval Dataset
3-test 85.21 (Available: 141/164=85.98)
5-test 83.43 (Available: 138/164=84.15)
multi perfect 82.87 (Available: 137/164=83.54)

MBPP Dataset
3-test 76.67 (Available: 206/257=80.16)
5-test 77.0 (Available: 206/257=80.16)
multi perfect 75.86 (Available: 205/257=79.77)


# Mutation Score

In [21]:
def calculate_mutation(mutation_report, data_files):
    """Average the mutation scores found in a mutation report.

    ``mutation_report`` is an iterable of text lines. Every line containing
    '[*] Mutation score' contributes one score: the text captured after the
    colon, with its last character (the percent sign) dropped, parsed as a
    float. The sum is divided by ``len(data_files)``, not by the number of
    scores found.

    Returns ``(average rounded to two decimals, number of scores found)``.
    """
    regex_pattern = r'[\n\r]*Mutation score.*: [ \t]*([^\n\r]*)' 
    scores = [
        float(re.findall(regex_pattern, report_line)[0][:-1])
        for report_line in mutation_report
        if '[*] Mutation score' in report_line
    ]
    return round(sum(scores) / len(data_files), 2), len(scores)

In [45]:
def check_mutation_operator(mutation_report):
    """Count how often each mutation operator appears in a mutation report.

    ``mutation_report`` is an iterable of text lines. A line of the form
    '   - [#   N] <operator> problem_id_...' contributes one occurrence of
    ``<operator>``.

    Returns a dict mapping operator name to its count, ordered from most to
    least frequent; operators with equal counts keep first-seen order.
    """
    # The character class used to start with '[[', which makes `re` emit
    # "FutureWarning: Possible nested set" when compiling the pattern. The
    # duplicate '[' was redundant, so the class below matches the same set
    # of characters: '[', '#', space, digits and ']'.
    regex_pattern = r'   - [\[#   \d\]]+([^\n]*) problem_id_'
    mutation_report_full = "\n".join(mutation_report)
    mutation_operator_dict = {}
    for op in re.findall(regex_pattern, mutation_report_full):
        mutation_operator_dict[op] = mutation_operator_dict.get(op, 0) + 1
    # sorted() is stable, so ties stay in first-seen order.
    return dict(sorted(mutation_operator_dict.items(), key=lambda item: item[1], reverse=True))

In [46]:
# Mutation score per test setting, plus how many mutation scores were found
# relative to the dataset size.
print('HumanEval Dataset')
for label, report_lines in zip(test_type, humaneval_mutation_list):
    score, count = calculate_mutation(report_lines, humaneval_data_files)
    available_pct = round(count/len(humaneval_data_files)*100,2)
    print(label, score, f'(Available: {count}/{len(humaneval_data_files)}={available_pct})')
print()
print('MBPP Dataset')
for label, report_lines in zip(test_type, mbpp_mutation_list):
    score, count = calculate_mutation(report_lines, mbpp_data_files)
    available_pct = round(count/len(mbpp_data_files)*100,2)
    print(label, score, f'(Available: {count}/{len(mbpp_data_files)}={available_pct})')

HumanEval Dataset
3-test 23.61 (Available: 140/164=85.37)
5-test 21.45 (Available: 138/164=84.15)
multi perfect 25.59 (Available: 136/164=82.93)

MBPP Dataset
3-test 19.02 (Available: 206/257=80.16)
5-test 21.71 (Available: 206/257=80.16)
multi perfect 20.71 (Available: 205/257=79.77)


In [47]:
# Mutation operator frequencies per test setting, most frequent first.
print('HumanEval Dataset')
for label, report_lines in zip(test_type, humaneval_mutation_list):
    print(label, check_mutation_operator(report_lines))
print()
print('MBPP Dataset')
for label, report_lines in zip(test_type, mbpp_mutation_list):
    print(label, check_mutation_operator(report_lines))

HumanEval Dataset
3-test {'AOR': 365, 'ROR': 225, 'COI': 151, 'ASR': 59, 'SIR': 39, 'AOD': 37, 'LCR': 24, 'COD': 12, 'BCR': 2, 'EHD': 1}
5-test {'AOR': 355, 'ROR': 224, 'COI': 149, 'ASR': 58, 'SIR': 39, 'AOD': 37, 'LCR': 23, 'COD': 12, 'BCR': 2, 'EHD': 1}
multi perfect {'AOR': 348, 'ROR': 211, 'COI': 144, 'ASR': 62, 'SIR': 39, 'AOD': 37, 'LCR': 22, 'COD': 12, 'BCR': 2, 'EHD': 1}

MBPP Dataset
3-test {'AOR': 611, 'ROR': 233, 'COI': 161, 'ASR': 51, 'SIR': 34, 'LOR': 32, 'AOD': 27, 'LCR': 22, 'BCR': 10, 'COD': 9, 'EHD': 1, 'EXS': 1}
5-test {'AOR': 597, 'ROR': 234, 'COI': 161, 'ASR': 51, 'SIR': 34, 'LOR': 32, 'AOD': 27, 'LCR': 22, 'BCR': 10, 'COD': 8, 'EHD': 1, 'EXS': 1}
multi perfect {'AOR': 588, 'ROR': 226, 'COI': 155, 'ASR': 48, 'SIR': 34, 'LOR': 32, 'AOD': 27, 'LCR': 21, 'BCR': 10, 'COD': 9, 'EHD': 1, 'EXS': 1}
