In [1]:
import os 
import json
import re
from tqdm import tqdm 

In [2]:
# Load the benchmark timing JSON and report the loaded object's type and size.
# NOTE: despite the `test_` prefix in the variable name, the path below points
# at the *train* timing file (`train_out_times.json`).
test_out_times_path = "/largespace/tydata/code_optimization/cpp/dataset/benchmark_gem5_testcases3/train_out_times.json"
with open(test_out_times_path) as f:
    datasets = json.load(f)

# One value per line: first the container type, then its number of top-level entries.
print(type(datasets), len(datasets), sep="\n")

<class 'dict'>
2057


In [3]:
def make_pair_cross(sorted_problem_all_solutions):
    """Build every (earlier, later) identifier pair from an ordered sequence.

    The first element of each entry (``entry[0]``) is used as its identifier.
    For each position ``i`` and each later position ``j > i`` one tuple
    ``(identifier_i, identifier_j)`` is produced, ordered by ``i`` and then
    by ``j``.

    Args:
        sorted_problem_all_solutions: Sequence supporting ``len()`` and
            integer indexing whose entries are themselves indexable
            (the callers in this notebook pass ``(identifier, time)`` tuples).

    Returns:
        A list of 2-tuples of identifiers; empty when the input has fewer
        than two entries.
    """
    entries = sorted_problem_all_solutions
    count = len(entries)
    return [
        (entries[first][0], entries[second][0])
        for first in range(count)
        for second in range(first + 1, count)
    ]

In [4]:
def relative_improvement(slow:float, fast:float):
    """Return the fraction of ``slow`` that is saved by ``fast``.

    Computed as ``(slow - fast) / slow`` and rounded to 4 decimal places.
    The result is negative when ``fast`` is larger than ``slow``.

    Raises:
        ZeroDivisionError: if ``slow`` is zero.
    """
    saved = slow - fast
    fraction = saved / slow
    return round(fraction, 4)

In [8]:
# Collect, problem by problem, every (slow, fast) solution pair whose relative
# improvement is strictly positive.
all_items = []
for problem_id, per_user in tqdm(datasets.items()):
    # Flatten user -> solution -> time into a single mapping keyed by a
    # combined "<problem>_<user>_<solution>" identifier.
    times_by_identifier = {}
    for user_id, per_solution in per_user.items():
        for solution_id, measured in per_solution.items():
            times_by_identifier[f"{problem_id}_{user_id}_{solution_id}"] = measured

    # Largest time first, so in each generated pair the first member is never
    # faster than the second.
    ranked = list(times_by_identifier.items())
    ranked.sort(key=lambda entry: entry[1], reverse=True)

    for slow_identifier, fast_identifier in make_pair_cross(ranked):
        slow_time = times_by_identifier[slow_identifier]
        fast_time = times_by_identifier[fast_identifier]
        improvement = relative_improvement(slow_time, fast_time)
        # Ties (and differences that round to 0.0) are not kept.
        if not improvement > 0.0:
            continue
        all_items.append({
            "problem_id": problem_id,
            "slow_identifier": slow_identifier,
            "fast_identifier": fast_identifier,
            "slow_time": slow_time,
            "fast_time": fast_time,
            "improvement": improvement
        })

print(f"all pairs count = {len(all_items)}")
# with open("/largespace/tydata/code_optimization/cpp/dataset/by_problem/train_out_pair_improvement70.json", 'w') as writer:
#     json.dump(all_items, writer, indent=4)

100%|██████████| 2057/2057 [00:24<00:00, 85.67it/s] 

all pairs count = 15886969





## Make triples of 3 items (slow, current, fast)

In [12]:
def make_pair_3item(sorted_problem_all_solutions):
    """Build every ordered identifier triple from an ordered sequence.

    The first element of each entry (``entry[0]``) is used as its identifier.
    For all positions ``i < j < k`` one tuple
    ``(identifier_i, identifier_j, identifier_k)`` is produced, ordered by
    ``i``, then ``j``, then ``k``.

    Args:
        sorted_problem_all_solutions: Sequence supporting ``len()`` and
            integer indexing whose entries are themselves indexable
            (the caller in this notebook passes ``(identifier, time)`` tuples).

    Returns:
        A list of 3-tuples of identifiers; empty when the input has fewer
        than three entries. The number of triples grows cubically with the
        input length.
    """
    entries = sorted_problem_all_solutions
    count = len(entries)
    # With fewer than three entries the innermost range is always empty, so
    # no explicit length guard is needed.
    return [
        (entries[first][0], entries[middle][0], entries[last][0])
        for first in range(count)
        for middle in range(first + 1, count)
        for last in range(middle + 1, count)
    ]

In [13]:
# Collect, problem by problem, every (slow, current, fast) solution triple in
# which `fast` is a strict improvement over `current`.
all_items = []
for problem_id, problem_solutions in tqdm(datasets.items()):
    # Flatten user -> solution -> time into a single mapping keyed by a
    # combined "<problem>_<user>_<solution>" identifier.
    problem_all_solutions = {}
    for user_id, user_solutions in problem_solutions.items():
        for solution_id, time in user_solutions.items():
            identifier = f"{problem_id}_{user_id}_{solution_id}"
            problem_all_solutions[identifier] = time

    # Largest time first, so within each triple: slow >= current >= fast.
    sorted_problem_all_solutions = sorted(problem_all_solutions.items(), key=lambda x: x[1], reverse=True) # List
    # NOTE: this materializes every triple for the problem at once; the count
    # grows cubically with the number of solutions.
    triples = make_pair_3item(sorted_problem_all_solutions)

    for slow_identifier, current_identifier, fast_identifier in triples:
        slow_time = problem_all_solutions[slow_identifier]
        current_time = problem_all_solutions[current_identifier]
        fast_time = problem_all_solutions[fast_identifier]
        # The improvement is measured from `current` to `fast`, not from `slow`.
        improvement = relative_improvement(current_time, fast_time)
        if improvement > 0.0:
            item = {
                "problem_id": problem_id,
                "slow_identifier": slow_identifier,
                # Record the middle solution too: it is the baseline that
                # "improvement" is measured against, so without it the stored
                # value cannot be reproduced from the record.
                "current_identifier": current_identifier,
                "fast_identifier": fast_identifier,
                "slow_time": slow_time,
                "current_time": current_time,
                "fast_time": fast_time,
                "improvement": improvement
            }

            all_items.append(item)

print(f"all pairs count = {len(all_items)}")

 28%|██▊       | 570/2057 [05:57<18:39,  1.33it/s]  

In [46]:
# Directory containing one C++ source file per submission.
CPP_CODE_DIR = "/largespace/tydata/code_optimization/cpp/dataset/cpp_code/train"


def _read_solution_code(problem_id, identifier):
    """Return the C++ source text of the solution named by ``identifier``.

    ``identifier`` is split on "_" and fields 1 and 2 are taken as the user id
    and the submission id, matching the "<problem>_<user>_<solution>" layout
    used when the identifiers are built earlier in this notebook.

    Args:
        problem_id: Problem id used as the first component of the file name.
        identifier: Combined solution identifier string.

    Returns:
        The full text of the corresponding ``.cpp`` file.

    Raises:
        IndexError: if ``identifier`` has fewer than three "_"-separated fields.
        OSError: if the source file cannot be opened.
    """
    parts = identifier.split("_")
    user_id, submission_id = parts[1], parts[2]
    # The file name puts the submission id BEFORE the user id, i.e. the
    # reverse of the order used inside the identifier.
    cpp_file_path = os.path.join(CPP_CODE_DIR, f"{problem_id}_{submission_id}_{user_id}.cpp")
    # Explicit encoding so decoding does not depend on the machine's locale.
    with open(cpp_file_path, 'r', encoding="utf-8") as source_file:
        return source_file.read()


with open("/largespace/tydata/code_optimization/cpp/dataset/by_problem/train_out_pair_improvement90.json", 'r', encoding="utf-8") as reader:
    all_items = json.load(reader)

print(f"all items count = {len(all_items)}")

# Attach the slow and fast source code to every item (items are updated in place).
for item in tqdm(all_items):
    problem_id = item["problem_id"]

    # Read both files before touching the item so a failed read never leaves
    # an item with only one of the two code fields set.
    slow_code = _read_solution_code(problem_id, item["slow_identifier"])
    fast_code = _read_solution_code(problem_id, item["fast_identifier"])

    item["slow_code"] = slow_code
    item["fast_code"] = fast_code

with open("/largespace/tydata/code_optimization/cpp/dataset/by_problem/train_out_pair_improvement90_code.json", 'w', encoding="utf-8") as writer:
    json.dump(all_items, writer, indent=4)



all items count = 14051


  0%|          | 0/14051 [00:00<?, ?it/s]

100%|██████████| 14051/14051 [00:00<00:00, 35663.81it/s]
