In [None]:
import os
import subprocess

# --- Settings for this generation run -------------------------------------
DATASETS_ROOT = "arcade_nl2code/annotated_dataset/dataset"
dataset_src = "existing_tasks"
schema_repr = "originating_dfs.header_description.after_variable_cell"
max_prompt = 4000
# File name of the truncation metadata; resolved against the derived-datasets
# folder below. (The variable name is read again by a later cell.)
trunccate_path = "dataset.schema.originating_dfs.header_description.after_variable_cell.maxp900.maxp_no_prefix-1.maxctxcell-1.truncate_metadata.json"

# --- Paths: everything lives under <DATASETS_ROOT>/<dataset_src> ----------
source_root = os.path.join(DATASETS_ROOT, dataset_src)
dataset_path = os.path.join(source_root, "dataset.json")
folder_path = os.path.join(source_root, "derived_datasets/")
artifacts_path = os.path.join(source_root, "artifacts/")
truncate_metadata_path = os.path.join(folder_path, trunccate_path)

# --- Command line ----------------------------------------------------------
# The generator module is launched through `faketime` with a fixed timestamp.
command = [
    "faketime", "2022-12-10 12:00:00",
    "python", "-m",
    "arcade_nl2code.annotated_dataset.generate_schema_augmented_prompts",
]

# (flag, value) pairs, appended in the order the generator is invoked with.
generator_options = [
    ("--dataset", dataset_path),
    ("--output_folder", folder_path),
    ("--runtime_artifacts_root", artifacts_path),
    ("--schema_representation_method", schema_repr),
    ("--max_prompt_size", str(max_prompt)),
    ("--truncate_metadata_path", truncate_metadata_path),
]
for flag, value in generator_options:
    command += [flag, value]

# Value-less switch goes last.
command.append("--ignore_errors")

# check=True: a non-zero exit status raises CalledProcessError in the cell.
subprocess.run(command, check=True)

In [None]:
import os
import subprocess

def generate_dataset(
    dataset_name,
    dataset_src=None,
    add_exemplars=False,
    max_prompt_size=4000,
    max_notebook_context_length=3000,
    prompt_style="vanilla",
    exemplar_index_str="0,1,2,3,4,5",
):
    """Run the schema-augmented prompt generator for one dataset source.

    Builds the command line for
    ``arcade_nl2code.annotated_dataset.generate_schema_augmented_prompts``
    (launched through ``faketime`` with a fixed timestamp) and executes it.

    Args:
        dataset_name: Label used in the output file name, which is
            ``dataset.<dataset_name>.<prompt_style>.json``.
        dataset_src: Sub-folder of the dataset root that holds
            ``dataset.json`` and ``artifacts/``. When omitted, ``dataset_name``
            is used as the source folder as well.
        add_exemplars: When true, the exemplar-related flags are appended to
            the command.
        max_prompt_size: Value passed as ``--max_prompt_size``.
        max_notebook_context_length: Value passed as
            ``--max_notebook_context_len``; only used with ``add_exemplars``.
        prompt_style: One of ``"vanilla"``, ``"step_by_step"``,
            ``"step_by_step+preamble"``,
            ``"step_by_step+preamble+explanation"``. It always appears in the
            output file name; with ``add_exemplars`` it is also mapped to the
            ``--format_configs`` value.
        exemplar_index_str: Comma-separated string passed as
            ``--exemplar_index``; only used with ``add_exemplars``.

    Raises:
        KeyError: If ``add_exemplars`` is true and ``prompt_style`` is not one
            of the known styles.
        subprocess.CalledProcessError: If the generator exits with a non-zero
            status.
    """
    DATASETS_ROOT = "arcade_nl2code/annotated_dataset/dataset"
    schema_repr_method = "originating_dfs.header_description.after_variable_cell"

    # Callers may pass a single name when the label and the source folder match.
    if dataset_src is None:
        dataset_src = dataset_name

    dataset_path = os.path.join(DATASETS_ROOT, dataset_src, "dataset.json")
    # Output goes to the local "datasets" tree rather than under DATASETS_ROOT.
    folder_path = os.path.join("datasets", dataset_src, "derived_datasets/")
    artifacts_path = os.path.join(DATASETS_ROOT, dataset_src, "artifacts/")

    # Notebook-facing prompt style -> generator --format_configs value.
    prompt_styles = {
        "vanilla": "short_code_no_preamble",
        "step_by_step": "step_only_no_preamble",
        "step_by_step+preamble": "step_only",
        "step_by_step+preamble+explanation": "step+explanation",
    }

    file_name_prefix = f"dataset.{dataset_name}"
    dataset_file_name = f"{file_name_prefix}.{prompt_style}.json"

    cmd = [
        "faketime", "2022-12-10 12:00:00",
        "python", "-m",
        "arcade_nl2code.annotated_dataset.generate_schema_augmented_prompts",
        "--dataset", dataset_path,
        "--output_folder", folder_path,
        "--output_dataset_name", dataset_file_name,
        "--runtime_artifacts_root", artifacts_path,
        "--schema_representation_method", schema_repr_method,
        "--max_prompt_size", str(max_prompt_size),
    ]

    if add_exemplars:
        cmd.extend([
            "--max_notebook_context_len", str(max_notebook_context_length),
            "--add_exemplars",
            "--exemplar_notebook", "arcade_nl2code/annotated_dataset/resources/prompt_exemplar_templates.ipynb",
            "--format_configs", prompt_styles[prompt_style],
            "--exemplar_index", exemplar_index_str,
        ])

    # check=True so a failed generation surfaces instead of passing silently.
    subprocess.run(cmd, check=True)


In [None]:
# Exemplar-augmented run for "existing_tasks" with the
# step_by_step+preamble+explanation prompt style and exemplar indices 0,1,4,5.
generate_dataset(
    "existing_tasks",
    add_exemplars=True,
    prompt_style="step_by_step+preamble+explanation",
    exemplar_index_str="0,1,4,5",
)

In [None]:
# --- Settings shared by every run of the sweep ------------------------------
DATASETS_ROOT = "arcade_nl2code/annotated_dataset/dataset"
dataset_src = "existing_tasks"
schema_repr_method = "originating_dfs.header_description.after_variable_cell"
max_prompt_size = 6000

source_root = os.path.join(DATASETS_ROOT, dataset_src)
dataset_path = os.path.join(source_root, "dataset.json")
artifacts_path = os.path.join(source_root, "artifacts/")
# Output goes to the local "datasets" tree rather than under DATASETS_ROOT.
folder_path = os.path.join("datasets", dataset_src, "derived_datasets/")
# `trunccate_path` is not assigned in this cell; it has to exist in the kernel
# already. NOTE(review): truncate_metadata_path is not passed to the command
# below -- confirm whether it is still needed here.
truncate_metadata_path = os.path.join(folder_path, trunccate_path)

# Generator --format_configs values and the prompt-style label paired with each.
config_names = ["short_code_no_preamble", "step_only_no_preamble", "step_only", "step+explanation"]
prompt_styles = ["vanilla_prompting", "step_by_step", "step_by_step+preamble", "step_by_step+preamble+explanation"]

# Arguments that are identical for every run; only the format config and the
# exemplar selection are appended per iteration.
base_command = [
    "faketime", "2022-12-10 12:00:00", "python", "-m",
    "arcade_nl2code.annotated_dataset.generate_schema_augmented_prompts",
    "--dataset", dataset_path,
    "--output_folder", folder_path,
    "--runtime_artifacts_root", artifacts_path,
    "--schema_representation_method", schema_repr_method,
    "--max_prompt_size", str(max_prompt_size),
    "--add_exemplars",
    "--exemplar_notebook", "arcade_nl2code/annotated_dataset/resources/prompt_exemplar_templates.ipynb",
]

# One generator run per (exemplar selection, format config) pair.
for exemplar_index in ["1,2,3,5", "1,3,4,5", "0,1,4,5"]:
    # NOTE(review): exemplar_index_string and prompt_style are computed but
    # never passed to the command -- presumably meant for an output name; confirm.
    exemplar_index_string = "_".join(exemplar_index.split(","))

    for config_name, prompt_style in zip(config_names, prompt_styles):
        run_arguments = ["--format_configs", config_name, "--exemplar_index", exemplar_index]
        # Exit status is not checked; a failing run does not stop the sweep.
        subprocess.run(base_command + run_arguments)


In [None]:
from src.experiments import generate_dataset

# generate_dataset("vanilla_default")

In [None]:
# Keyword arguments forwarded verbatim to generate_dataset.
dataset_settings = dict(
    dataset_name="vanilla_default",
    add_exemplars=False,
    max_prompt_size=900,
    max_notebook_context_length=1200,
    prompt_style="vanilla",
)

# Experiment description: a name, the dataset settings, and the model list.
config = dict(
    name="vanilla_default_llama",
    dataset=dataset_settings,
    models=["llama3-70b"],
)

# Only the "dataset" section is consumed here.
generate_dataset(**config["dataset"])

In [None]:
# Copy the runtime artifacts of both dataset sources into ./artifacts/.
# Explicit `!` shell escapes: bare `cp` lines rely on IPython's alias/automagic
# handling, which is not applied to multi-line cells in current IPython.
# NOTE(review): both copies target the same artifacts/ directory; how the
# second one lands (merged vs. nested) depends on the platform's `cp` -- confirm.
!cp -r arcade_nl2code/annotated_dataset/dataset/existing_tasks/artifacts/ artifacts/
!cp -r arcade_nl2code/annotated_dataset/dataset/new_tasks/artifacts/ artifacts/

import os
import json

DATASETS_DIR = "arcade_nl2code/annotated_dataset/dataset/"

# Merge the per-source dataset.json files into one list, tagging every entry
# with the source it came from under the "dataset_src" key.
dataset = []

for dataset_src in ["existing_tasks", "new_tasks"]:
    temp_dataset_path = os.path.join(DATASETS_DIR, dataset_src, "dataset.json")
    with open(temp_dataset_path, "r") as f:
        data = json.load(f)
    # dict union (Python 3.9+) leaves the loaded entries themselves untouched.
    dataset.extend(entry | {"dataset_src": dataset_src} for entry in data)

# Write once, after both sources are merged, so the output file never holds a
# partial dataset.
with open(os.path.join("datasets", "dataset.json"), "w") as f:
    json.dump(dataset, f, indent=2)

In [None]:
import os
import json

DATASETS_DIR = "../datasets/arcade"

# File-name suffixes of the few-shot variants to merge, and the two sources
# whose files are combined for each variant.
variant_suffixes = (
    ".arcade.few_shot.cot.explanations.json",
    ".arcade.few_shot.cot.json",
    ".arcade.few_shot.vanilla.json",
)
source_names = ("existing_tasks", "new_tasks")

for suffix in variant_suffixes:
    # Start a fresh merged list for every variant.
    dataset = []

    for dataset_src in source_names:
        source_file = os.path.join(DATASETS_DIR, f"dataset.{dataset_src}{suffix}")
        with open(source_file, "r") as source_handle:
            data = json.load(source_handle)
        # Tag each entry with the source it was read from.
        dataset += [entry | {"dataset_src": dataset_src} for entry in data]

    # One merged file per variant: ../datasets/dataset<suffix>
    merged_file = os.path.join("../datasets", f"dataset{suffix}")
    with open(merged_file, "w") as merged_handle:
        json.dump(dataset, merged_handle, indent=2)