In [None]:
from datasets import load_dataset
import os
import json

# Load the "train" split of KodCode-V1 and print the dataset object as a quick check.
train_dataset = load_dataset("KodCode/KodCode-V1", split="train")
print(f"Training set: {train_dataset}")


## Filter Rules

`style`: instruct

`subset`: Leetcode, Codeforces, Code_Contests, Taco, Apps

`GPT4o Pass Count`: < 9

`Benchmark Similarity`: < 0.9

Test count: >= 4

In [None]:
# Keep only rows whose 'subset' is one of the listed sources.
filtered_train_dataset = train_dataset.filter(
    lambda row: row['subset'] in ('Leetcode', 'Codeforces', 'Code_Contests', 'Apps', 'Taco')
)
# Keep only rows whose 'style' is 'instruct'.
instruct_dataset = filtered_train_dataset.filter(
    lambda row: row['style'] == 'instruct'
)
# Keep rows with gpt_pass_trial_num strictly below 9.
low_trials_dataset = instruct_dataset.filter(
    lambda row: row['gpt_pass_trial_num'] < 9
)
# Keep rows with benchmark_similarity strictly below 0.9.
high_quality_dataset = low_trials_dataset.filter(
    lambda row: row['benchmark_similarity'] < 0.9
)
# Keep rows whose test code contains the substring 'def' at least 8 times.
# NOTE(review): the "Filter Rules" notes state "Test count: >= 4" while the
# threshold applied here is 8 -- confirm which value is intended.
def_filtered_dataset = high_quality_dataset.filter(
    lambda row: row['test_code'].count('def') >= 8
)

## Bad Data Removal

In [None]:
# Question ids flagged as bad data; rows carrying one of them are dropped.
bad_ids = ["Codeforces_12376_I"]
error_filtered_dataset = def_filtered_dataset.filter(
    lambda row: not (row['question_id'] in bad_ids)
)

In [None]:
def format_test_info(test_info):
    """
    Build the starter-code instruction from the function declarations in test_info.

    Args:
        test_info: List of test info dictionaries. Only the
            'function_declaration' value of each entry is used; entries that
            lack the key, or whose value is None, are skipped.

    Returns:
        An instruction string listing every declaration on its own line, or
        "" when test_info is empty/None or contains no declaration.
    """
    # Nothing to format when there is no test info at all.
    if not test_info:
        return ""

    # Every entry is handled the same way (the first one included), so a
    # missing or None declaration is skipped instead of raising
    # KeyError / TypeError.
    declarations = [
        test['function_declaration']
        for test in test_info
        if 'function_declaration' in test and test['function_declaration'] is not None
    ]
    if not declarations:
        return ""

    starter_code = "\n".join(declarations)

    return f"Use the following starter code for your solution:\n\n{starter_code}\n"

# Preview the starter-code instruction generated for the entry at index 66.
preview_entry = error_filtered_dataset[66]
inst = format_test_info(preview_entry['test_info'])
print(inst)

In [None]:
def extract_imports(import_line):
    """
    Turn `from solution import ...` line(s) into an instruction naming the imports.

    Args:
        import_line: Text holding one or more newline-separated lines. Each
            line is stripped before matching, so indented import statements
            are recognised as well. Lines that do not start with
            "from solution import" are ignored.

    Returns:
        An instruction string listing what each matching line imports (one
        line per import statement), or "" when the input is empty/None or no
        line matches.
    """
    prefix = "from solution import"

    if not import_line:
        return ""

    imported_parts = []
    for raw_line in import_line.split('\n'):
        # Strip first: callers select lines with `line.strip().startswith(...)`,
        # so an indented import must be accepted here too.
        line = raw_line.strip()
        if not line.startswith(prefix):
            continue
        # Everything after the prefix is the imported name list.
        names = line[len(prefix):].strip()
        if names:
            imported_parts.append(names)

    if not imported_parts:
        return ""

    imports = "\n".join(imported_parts)

    # Convert to starter code format
    return f"The code you write must contain the following functions or classes:\n{imports}\n"

# Preview the instruction derived from the `from solution import` lines of the first entry.
sample_test_code = error_filtered_dataset[0]['test_code']
sample_import_lines = [
    line
    for line in sample_test_code.split('\n')
    if line.strip().startswith('from solution import')
]
imported_names = extract_imports('\n'.join(sample_import_lines))
print(imported_names)

In [None]:
dataset = []
for entry in error_filtered_dataset:
    # Split the test source into the `from solution import ...` lines and the rest.
    import_lines = []
    remaining_lines = []
    for line in entry['test_code'].split('\n'):
        if line.strip().startswith('from solution import'):
            import_lines.append(line)
        else:
            remaining_lines.append(line)
    solution_import = '\n'.join(import_lines)
    tests = '\n'.join(remaining_lines)

    # Use the declarations from test_info when present; otherwise fall back
    # to the names imported from `solution` by the tests.
    test_info = entry['test_info']
    instruction = format_test_info(test_info) if test_info else extract_imports(solution_import)

    problem =  f"""
Please solve the programming task below using a self-contained code snippet in a markdown code block.
    
{entry['question'].strip()}

{instruction}
"""

    # Entries left with no test code after removing the import lines are dropped.
    if not tests:
        continue

    dataset.append({
        "problem": problem,
        "solutions": entry["solution"],
        "tests": tests,
    })

print(f'Dataset size: {len(dataset)}')

# Output location, resolved against the current working directory.
output_dir = os.path.abspath("../../train/code")
output_file = os.path.join(output_dir, "kodcode.json")

# open(..., "w") does not create missing parent directories, so make sure the
# target directory exists before writing (no-op when it already does).
os.makedirs(output_dir, exist_ok=True)

# Serialise the collected entries as pretty-printed JSON.
with open(output_file, "w") as f:
    json.dump(dataset, f, indent=4)