In [None]:
from datasets import load_dataset

# Load only the "train" split of KodCode-V1; the filtering cells below start
# from `train_dataset`.
KODCODE_REPO_ID = "KodCode/KodCode-V1"
train_dataset = load_dataset(KODCODE_REPO_ID, split="train")
print("Training set:", train_dataset)


## Filter Rules

`style`: instruct

`subset`: Leetcode, Codeforces, Code Contests, Taco, Apps

`GPT4o Pass Count`: < 9

`Benchmark Similarity`: < 0.9

`Test count`: >= 4

In [None]:
# Filter cascade: each stage narrows the previous stage's output, and every
# intermediate dataset keeps its own name so it can be inspected separately.

# Values used by the stages below.
ALLOWED_SUBSETS = ('Leetcode', 'Codeforces', 'Code_Contests', 'Apps', 'Taco')
REQUIRED_STYLE = 'instruct'
GPT_PASS_TRIAL_UPPER_BOUND = 9       # exclusive: rows must be strictly below this
BENCHMARK_SIMILARITY_UPPER_BOUND = 0.9  # exclusive
MIN_DEF_OCCURRENCES = 8              # inclusive lower bound on 'def' substrings in test_code

# Stage 1: keep rows whose `subset` is one of the allowed subsets.
filtered_train_dataset = train_dataset.filter(
    lambda row: row['subset'] in ALLOWED_SUBSETS
)

# Stage 2: keep rows whose `style` is 'instruct'.
instruct_dataset = filtered_train_dataset.filter(
    lambda row: row['style'] == REQUIRED_STYLE
)

# Stage 3: keep rows with gpt_pass_trial_num < 9.
low_trials_dataset = instruct_dataset.filter(
    lambda row: row['gpt_pass_trial_num'] < GPT_PASS_TRIAL_UPPER_BOUND
)

# Stage 4: keep rows with benchmark_similarity < 0.9.
high_quality_dataset = low_trials_dataset.filter(
    lambda row: row['benchmark_similarity'] < BENCHMARK_SIMILARITY_UPPER_BOUND
)

# Stage 5: keep rows whose test_code contains the substring 'def' at least 8 times.
# NOTE(review): the "Filter Rules" text says "Test count: >= 4" but the threshold
# applied here is 8, and str.count('def') also matches 'def' inside longer words.
# Confirm the intended cutoff before changing either side.
def_filtered_dataset = high_quality_dataset.filter(
    lambda row: row['test_code'].count('def') >= MIN_DEF_OCCURRENCES
)

In [None]:
import os
import json

# Build the final list of {"problem", "solutions", "tests"} records from
# `def_filtered_dataset` and write it to ../../train/code/kodcode.json.

# JSON list of records, each with a 'problem' key; any prompt that appears here
# is excluded from the output (presumably problems whose tests errored — confirm).
# NOTE(review): absolute, machine-specific path; this cell raises
# FileNotFoundError wherever the file is absent. Consider moving it to config.
error_path = "/home/roy/rllm/tests/rllm/rewards/kodcode_test_err_copy.json"
with open(error_path, "r", encoding="utf-8") as f:
    error_data = json.load(f)
error_prob = [entry['problem'] for entry in error_data]
# Set copy so the per-row exclusion check is O(1) instead of a list scan.
excluded_problems = set(error_prob)

dataset = []
for entry in def_filtered_dataset:
    # Drop `from solution import ...` lines from the test code.
    tests = '\n'.join(
        line
        for line in entry['test_code'].split('\n')
        if not line.strip().startswith('from solution import')
    )
    # Skip rows left with no test code (empty or whitespace-only).
    if not tests.strip():
        continue

    problem = f"Please solve the programming task below using a self-contained code snippet in a markdown code block.\n\n{entry['question'].strip()}"
    # Skip prompts listed in the error file.
    if problem in excluded_problems:
        continue

    dataset.append({
        "problem": problem,
        "solutions": entry["solution"],
        "tests": tests,
    })

print(f'Dataset size: {len(dataset)}')


# The relative path is resolved against the current working directory.
output_dir = os.path.abspath("../../train/code")
output_file = os.path.join(output_dir, "kodcode.json")

# Create the target directory if missing so the write cannot fail on a fresh checkout.
os.makedirs(output_dir, exist_ok=True)
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(dataset, f, indent=4)