In [62]:
from datasets import load_dataset, Dataset
import tiktoken
from collections import Counter
import random

In [63]:
# Load the 'train' split of the source dataset.
# NOTE: despite its name, `df` is a `datasets.Dataset` (it is later used via
# `df.select(...)` and its repr prints `Dataset({...})`), not a pandas DataFrame.
df = load_dataset('verifiers-for-code/cleaned_deepseek_plans', split='train')

In [64]:
# Tokenizer used by `token_count` to measure problem/solution lengths;
# tiktoken resolves the encoding from the "gpt-4" model name.
enc = tiktoken.encoding_for_model("gpt-4")

In [65]:
def token_count(text):
    """Return the number of tokens the module-level encoder `enc` yields for `text`.

    Args:
        text: The string to tokenize.

    Returns:
        The length of the token sequence produced by `enc.encode`.
    """
    # By default tiktoken raises ValueError when the text contains a
    # special-token string (e.g. "<|endoftext|>"). Dataset text can contain such
    # strings verbatim, so disable that check and count them as ordinary text.
    return len(enc.encode(text, disallowed_special=()))

def get_unique_random_sample_indices(dataset, n, max_problem_tokens, max_solution_tokens,
                                     *, seed=None, token_counter=None):
    """Pick row indices for up to `n` distinct problems that fit the token limits.

    Rows are scanned in order. A row is eligible when its problem text has not
    already been accepted and both its problem and its solution are strictly
    below their token limits. Each accepted problem keeps the index of the
    first eligible row it appeared in.

    Args:
        dataset: Any object where `dataset['input']` and `dataset['code']` are
            iterables of problem and solution strings, aligned by position.
        n: Maximum number of indices to return.
        max_problem_tokens: Exclusive upper bound on the problem's token count.
        max_solution_tokens: Exclusive upper bound on the solution's token count.
        seed: Optional seed for a private `random.Random`, making the sample
            reproducible. When None, the global `random` module state is used.
        token_counter: Optional callable mapping a string to its token count.
            When None, the module-level `token_count` is used.

    Returns:
        A list of row indices. If there are at most `n` eligible problems, all
        of them are returned in scan order; otherwise a random sample of size
        `n` is returned.
    """
    if token_counter is None:
        # Resolved at call time so the default follows the module-level helper.
        token_counter = token_count
    rng = random if seed is None else random.Random(seed)

    # Map each accepted problem text to the index of its first eligible row.
    unique_problems = {}
    for i, (problem, solution) in enumerate(zip(dataset['input'], dataset['code'])):
        if problem in unique_problems:
            continue
        if (token_counter(problem) < max_problem_tokens and
                token_counter(solution) < max_solution_tokens):
            unique_problems[problem] = i

    eligible_indices = list(unique_problems.values())

    # If we have fewer eligible unique problems than requested, return all of them
    if len(eligible_indices) <= n:
        return eligible_indices

    # random.sample selects by position, so sampling the indices directly gives
    # the same draw as sampling the problem keys and mapping them back.
    return rng.sample(eligible_indices, n)

# Sampling configuration: up to 10,000 unique problems, each with a problem
# under 512 tokens and a solution under 1024 tokens.
SAMPLE_SEED = 42
NUM_SAMPLES = 10000
MAX_PROBLEM_TOKENS = 512
MAX_SOLUTION_TOKENS = 1024

# Seed the global RNG so re-running the notebook on the same source data
# draws the same sample instead of a different one each time.
random.seed(SAMPLE_SEED)
sampled_indices = get_unique_random_sample_indices(
    df, NUM_SAMPLES, MAX_PROBLEM_TOKENS, MAX_SOLUTION_TOKENS
)

# Create a new dataset with the sampled rows, keeping all columns
result_dataset = df.select(sampled_indices)

In [66]:
# Show the sampled dataset's summary (columns and row count).
result_dataset

Dataset({
    features: ['data_name', 'id', 'prompt', 'code', 'text', 'input', 'generated_plans_DeepSeek-Coder-V2-Instruct'],
    num_rows: 10000
})

In [67]:
# Strip the metadata/plan columns so only 'code' and 'input' remain.
topush = result_dataset.remove_columns(
    [
        'data_name',
        'id',
        'prompt',
        'text',
        'generated_plans_DeepSeek-Coder-V2-Instruct',
    ]
)

In [68]:
# Confirm that only the 'code' and 'input' columns are left.
topush

Dataset({
    features: ['code', 'input'],
    num_rows: 10000
})

In [69]:
# Rebuild a fresh Dataset from the two remaining columns, listing 'input'
# before 'code'.
new_dataset = Dataset.from_dict(
    {column: topush[column] for column in ('input', 'code')}
)

In [70]:
# Show the rebuilt dataset's summary (column order and row count).
new_dataset

Dataset({
    features: ['input', 'code'],
    num_rows: 10000
})

In [71]:
# Upload the two-column sample to the Hugging Face Hub dataset repo below.
# NOTE(review): presumably requires being logged in with write access to the
# 'verifiers-for-code' namespace — confirm before re-running.
new_dataset.push_to_hub('verifiers-for-code/sampled_10k_from_27k')

Creating parquet from Arrow format: 100%|██████████| 10/10 [00:00<00:00, 89.88ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.42s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/verifiers-for-code/sampled_10k_from_27k/commit/84a9d4c22b45f9b21f900856a8441e8111bd96dc', commit_message='Upload dataset', commit_description='', oid='84a9d4c22b45f9b21f900856a8441e8111bd96dc', pr_url=None, pr_revision=None, pr_num=None)