In [6]:
from datasets import Dataset, concatenate_datasets
import pandas as pd

In [2]:
main = load_dataset('verifiers-for-code/merged-227k', split='train')

In [4]:
merge1 = load_dataset('verifiers-for-code/CodePython-27k-multiple-plangen', split='train')

In [7]:
# Convert Dataset to pandas DataFrame
def dataset_to_dataframe(dataset):
    return pd.DataFrame({col: dataset[col] for col in dataset.features})

# Convert pandas DataFrame back to Dataset
def dataframe_to_dataset(df):
    return Dataset.from_pandas(df)

In [14]:
# Convert to pandas DataFrames
main_df = dataset_to_dataframe(main)
merge1_df = dataset_to_dataframe(merge1)

# Merge datasets
merged = pd.merge(main_df, merge1_df, left_on=['problem', 'solution'], right_on=['input', 'code'], how='inner')

# Create additional rows for other plan types
plan_columns = [
    'non_granular_plans_Llama-3-70B',
    'granular_plans_Llama-3-70B',
    'non_granular_plans_temp_0_dot_7_llama_3_70B',
    'non_granular_plans_temp_0_llama_3_70B'
]

additional_rows = []

for _, row in merged.iterrows():
    for plan_col in plan_columns:
        new_row = row[['problem', 'solution']].copy()
        new_row['70B_plans'] = row[plan_col]
        additional_rows.append(new_row)

# Create a DataFrame from additional rows
additional_df = pd.DataFrame(additional_rows)

# Concatenate the original main dataset with the additional rows
final_df = pd.concat([main_df, additional_df], ignore_index=True)

# Convert the final DataFrame back to a Dataset
final_dataset = dataframe_to_dataset(final_df)


In [17]:
final_dataset

Dataset({
    features: ['problem', 'solution', '70B_plans'],
    num_rows: 336120
})

In [18]:
merge2 = load_dataset('verifiers-for-code/Python-Alpaca-18k-plangen', split='train')

In [19]:
merge2

Dataset({
    features: ['instruction', 'output', 'input', 'non_granular_plans_Llama_3_70B'],
    num_rows: 18534
})

In [20]:
merge2 = merge2.rename_column("input", "problem")
merge2 = merge2.rename_column("output", "solution")
merge2 = merge2.rename_column("non_granular_plans_Llama_3_70B", "70B_plans")

# Remove 'instruction' column from merge2
merge2 = merge2.remove_columns(["instruction"])

# Concatenate datasets
combined_dataset = concatenate_datasets([final_dataset, merge2])

In [26]:
combined_dataset

Dataset({
    features: ['problem', 'solution', '70B_plans'],
    num_rows: 354654
})

In [34]:
merge3 = load_dataset('verifiers-for-code/tester3', split='train')

In [35]:
merge3

Dataset({
    features: ['data_name', 'id', 'prompt', 'code', 'text', 'input', 'non_granular_plans_Llama-3-70B', 'granular_plans_Llama-3-70B', 'deepseek_plans_eval', 'deepseek_solution_eval', 'non_granular_plans_temp_0_dot_7_llama_3_70B', 'non_granular_plans_temp_0_llama_3_70B', 'structured_plans_Llama-3_1-70B'],
    num_rows: 27224
})

In [36]:
merge3 = merge3.rename_column('input', 'problem')
merge3 = merge3.rename_column('code', 'solution')
merge3 = merge3.rename_column('structured_plans_Llama-3_1-70B', '70B_plans')

# Select only the columns we need from merge3
merge3_subset = merge3.select_columns(['problem', 'solution', '70B_plans'])

# Concatenate the datasets
combined2 = concatenate_datasets([combined_dataset, merge3_subset])

In [50]:
combined2

Dataset({
    features: ['problem', 'solution', '70B_plans'],
    num_rows: 381878
})

In [51]:
merge4 = load_dataset('verifiers-for-code/cleaned_deepseek_plans', split='train')

In [52]:
merge4

Dataset({
    features: ['data_name', 'id', 'prompt', 'code', 'text', 'input', 'generated_plans_DeepSeek-Coder-V2-Instruct'],
    num_rows: 27224
})

In [53]:
merge4 = merge4.rename_column('input', 'problem')
merge4 = merge4.rename_column('code', 'solution')
merge4 = merge4.rename_column('generated_plans_DeepSeek-Coder-V2-Instruct', '70B_plans')

# Select only the columns we need from merge3
merge4_subset = merge4.select_columns(['problem', 'solution', '70B_plans'])

# Concatenate the datasets
combined3 = concatenate_datasets([combined2, merge4_subset])

In [54]:
combined3

Dataset({
    features: ['problem', 'solution', '70B_plans'],
    num_rows: 409102
})

In [55]:
merge5 = load_dataset('verifiers-for-code/sampled_10k_from_227k', split='train')

Downloading readme: 100%|██████████| 481/481 [00:00<00:00, 1.59MB/s]
Downloading data: 100%|██████████| 41.3M/41.3M [00:03<00:00, 11.0MB/s]
Generating train split: 100%|██████████| 10000/10000 [00:00<00:00, 26642.17 examples/s]


In [56]:
merge5

Dataset({
    features: ['problem', 'solution', 'gpt-4o-mini-plans', 'text', 'text_gemma', 'text_nosys_phi'],
    num_rows: 10000
})

In [57]:
merge5 = merge5.rename_column('gpt-4o-mini-plans', '70B_plans')

# Select only the columns we need from merge3
merge5_subset = merge5.select_columns(['problem', 'solution', '70B_plans'])

# Concatenate the datasets
combined4 = concatenate_datasets([combined3, merge4_subset])

In [58]:
combined4

Dataset({
    features: ['problem', 'solution', '70B_plans'],
    num_rows: 436326
})

In [60]:
combined4.push_to_hub("verifiers-for-code/merged")

Creating parquet from Arrow format: 100%|██████████| 146/146 [00:00<00:00, 171.32ba/s]
Creating parquet from Arrow format: 100%|██████████| 146/146 [00:00<00:00, 147.36ba/s]
Creating parquet from Arrow format: 100%|██████████| 146/146 [00:02<00:00, 71.70ba/s]
Uploading the dataset shards: 100%|██████████| 3/3 [00:47<00:00, 15.85s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/verifiers-for-code/merged/commit/5342cf07ac7d04b0b66ce5d11a9c02e44cf57402', commit_message='Upload dataset', commit_description='', oid='5342cf07ac7d04b0b66ce5d11a9c02e44cf57402', pr_url=None, pr_revision=None, pr_num=None)