In [None]:
from src.llm import generate_response
from src.experiments import run_experiment, evaluate_experiment, generate_results


### Temperature Sampling

In [None]:

def generate_fn(t):
    """Sample three completions for one task, one per temperature.

    Args:
        t: Task object; only its ``input`` attribute is read and forwarded
            to ``generate_response``.

    Returns:
        List of three ``generate_response`` results, requested with the
        third argument set to ``i * 0.4`` for ``i`` in 0, 1, 2
        (i.e. 0.0, 0.4, 0.8), in that order.
    """
    return [
        generate_response("meta-llama/Llama-3-70b-chat-hf", t.input, i * 0.4)
        for i in range(3)
    ]


# Settings identifying the experiment and the dataset file it runs on.
config = {
    "name": "temperature_sampling_test",
    "dataset": "dataset.+schema.originating_dfs.header_description.after_variable_cell.maxp6000.maxp_no_prefix-1.maxctxcell-1.schema_only.json",
    "dataset_src": "existing_tasks",
}

# In this cell run_experiment takes the generation function first, then the config.
run_experiment(generate_fn, config)

In [None]:
# Evaluate the experiment described by `config` (defined in an earlier cell).
# NOTE(review): presumably scores the predictions written by run_experiment — confirm in src.experiments.
evaluate_experiment(config)

In [None]:
from src.experiments import generate_additional_metrics

# Experiment settings restated in this cell, so it does not rely on `config`
# left over from an earlier cell.
config = dict(
    name="temperature_sampling_test",
    dataset="dataset.+schema.originating_dfs.header_description.after_variable_cell.maxp6000.maxp_no_prefix-1.maxctxcell-1.schema_only.json",
    dataset_src="existing_tasks",
)

# NOTE(review): presumably augments the stored evaluation output with extra
# metrics — confirm in src.experiments.
generate_additional_metrics(config)

In [None]:
import os
import pandas as pd

# Experiment whose predictions are turned into a results table in this cell.
config = dict(
    name="temperature_sampling_test",
    dataset="dataset.+schema.originating_dfs.header_description.after_variable_cell.maxp6000.maxp_no_prefix-1.maxctxcell-1.schema_only.json",
    dataset_src="existing_tasks",
)

# Folder (relative to the working directory) holding one sub-folder per experiment.
EXPERIMENTS_ROOT = "experiments"
# Dataset location; not referenced by the code visible in this cell.
DATASETS_ROOT = "arcade_nl2code/annotated_dataset/dataset"

def generate_results_df(config):
    """Write ``results.csv`` for one experiment: predictions joined with task metadata.

    Reads ``<cwd>/<EXPERIMENTS_ROOT>/<config["name"]>/predictions.json``,
    scores every prediction of every turn with ``calc_codebleu`` against the
    turn's reference code, merges the per-turn rows with
    ``<cwd>/metadata/<config["dataset_src"]>/tasks.csv`` on
    ``notebook_name`` and ``turn_index`` (inner join), and writes the merged
    table to ``results.csv`` in the experiment folder.

    Args:
        config: Mapping with at least the keys ``"name"`` and ``"dataset_src"``.

    Returns:
        None. The merged table is written to disk.

    Raises:
        FileNotFoundError: If the predictions or metadata file does not exist.
        KeyError: If ``config`` or the predictions lack an expected key.
    """
    # `json` is not imported by the enclosing cell, so bring it in here.
    import json

    current_dir = os.getcwd()
    folder_path = os.path.join(current_dir, EXPERIMENTS_ROOT, config["name"])
    predictions_path = os.path.join(folder_path, "predictions.json")
    metadata_path = os.path.join(current_dir, "metadata", config["dataset_src"], "tasks.csv")
    output_path = os.path.join(folder_path, "results.csv")

    # Explicit encoding: JSON is UTF-8 regardless of the platform default.
    with open(predictions_path, "r", encoding="utf-8") as f:
        predictions = json.load(f)

    tasks = []
    for notebook in predictions:
        for i, turn in enumerate(notebook["turns"]):
            preds = turn["predictions"]
            ref = turn["metadata"]["example"]["turn"]["code"]["value"]
            # NOTE(review): calc_codebleu is not imported anywhere in the
            # visible notebook (presumably codebleu.calc_codebleu) — it must
            # be in scope before this function is called.
            codebleu_scores = [calc_codebleu([ref], [p], lang="python") for p in preds]

            tasks.append({
                "notebook_name": notebook["metadata"]["notebook_name"],
                "turn_index": i,
                "predictions": preds,
                "reference": ref,
                # One combined record per prediction: eval result `|` CodeBLEU
                # score (presumably both dicts — verify). zip stops at the
                # shorter of the two lists.
                "results": [e | s for e, s in zip(turn["eval_results"], codebleu_scores)],
            })

    # Explicit columns keep the join keys present even when there are no tasks,
    # so the merge below does not fail on an empty predictions file.
    df_tasks = pd.DataFrame(
        tasks,
        columns=["notebook_name", "turn_index", "predictions", "reference", "results"],
    )
    df_metadata = pd.read_csv(metadata_path)
    df = df_tasks.merge(df_metadata, on=["notebook_name", "turn_index"])
    df.to_csv(output_path, encoding="utf-8", index=False)

# Build and write results.csv for the experiment named in `config`.
generate_results_df(config)

In [None]:


# Draft configuration for a basic experiment.
config = {
    "name": "basic_test_experiment",
    # TODO: the draft left this value blank — set it to the dataset to use.
    "dataset_name": None,
    "model_name": "llama70b",
}

In [None]:
# Dataset presets keyed by dataset name; each preset repeats its own name
# under "dataset_name".
datasets = {
    "vanilla_default": dict(
        dataset_name="vanilla_default",
        add_exemplars=False,
        max_prompt_size=900,
        max_notebook_context_length=1200,
        prompt_style="vanilla",
    ),
}

In [None]:
from enum import Enum, auto


class LLM(Enum):
    """Symbolic names for the language models considered in this notebook.

    Member values come from ``auto()`` and carry no meaning of their own.
    NOTE(review): if members should map to provider model identifiers,
    replace ``auto()`` with those strings.
    """

    LLAMA_8B = auto()
    LLAMA_70B = auto()
    DEEPSEEK_33B = auto()
    GPT4 = auto()

In [None]:
from src.llm import generate_response
from src.experiments import run_experiment

# Values passed as the third argument of generate_response, one call per value.
temperatures = [0.0, 0.2, 0.4, 0.6, 0.8]


def generate_fn(model, task):
    """Collect one ``generate_response`` result per entry of ``temperatures``.

    Args:
        model: Model identifier forwarded unchanged to ``generate_response``.
        task: Object whose ``input`` attribute is forwarded as the second argument.

    Returns:
        List of responses in the same order as ``temperatures``.
    """
    responses = []
    for temperature in temperatures:
        responses.append(generate_response(model, task.input, temperature))
    return responses

# Baseline run: two models, the "vanilla_default" dataset settings, and the
# temperature list recorded under "metadata".
config = dict(
    name="vanilla_baseline",
    generate_fn=generate_fn,
    models=["LLAMA3_INSTRUCT_8B", "LLAMA3_INSTRUCT_70B"],
    dataset=dict(
        dataset_name="vanilla_default",
        add_exemplars=False,
        max_prompt_size=900,
        max_notebook_context_length=1200,
        prompt_style="vanilla",
    ),
    metadata=dict(temperatures=temperatures),
)

# Here run_experiment receives everything, including generate_fn, via the config.
run_experiment(config)

In [None]:
from src.llm import generate_response
from src.experiments import run_experiment

# Baseline run on a single model with the "vanilla_default" dataset settings.
# NOTE(review): the model id "llama3-70b" differs from the
# "LLAMA3_INSTRUCT_70B" spelling used in other cells — confirm which one
# src.experiments expects.
config = dict(
    name="vanilla_baseline",
    dataset=dict(
        dataset_name="vanilla_default",
        add_exemplars=False,
        max_prompt_size=900,
        max_notebook_context_length=1200,
        prompt_style="vanilla",
    ),
    models=["llama3-70b"],
)

run_experiment(config)

In [None]:
from src.experiments import generate_dataset

# Settings for the "vanilla_exemplars" variant; only the "dataset" section is
# used in this cell.
config = dict(
    name="vanilla_exemplars",
    # generate_fn=generate_fn,
    models=["LLAMA3_INSTRUCT_8B", "LLAMA3_INSTRUCT_70B", "DEEPSEEK_CODER_33B"],
    dataset=dict(
        dataset_name="vanilla_exemplars",
        add_exemplars=True,
        max_prompt_size=2100,
        max_notebook_context_length=1200,
        prompt_style="vanilla",
    ),
    metadata=dict(
        # temperatures=temperatures
    ),
)

# The dataset settings are expanded into keyword arguments.
generate_dataset(**config["dataset"])

In [None]:
from src.experiments import run_experiment

# Baseline variant that names the dataset only, with no further dataset settings.
config = dict(
    name="vanilla_baseline_no_max_token",
    models=["LLAMA3_INSTRUCT_8B", "LLAMA3_INSTRUCT_70B"],
    dataset=dict(dataset_name="vanilla_default"),
)

run_experiment(config)

In [None]:
from src.llm import generate_response
# Single ad-hoc call; as the cell's last expression, its return value is shown as the cell output.
generate_response("LLAMA3_INSTRUCT_70B", "hello there", 0.2)

In [None]:
from concurrent import futures

def generate_fn(model):
    """Build 25 identical call specs for ``generate_response``.

    Args:
        model: Model identifier placed in each spec.

    Returns:
        List of 25 tuples ``(generate_response, model, "hello there", 0.2)``;
        the first element is the callable, the rest are its arguments.
    """
    call_spec = (generate_response, model, "hello there", 0.2)
    return [call_spec for _ in range(25)]


# Submit every call spec to a pool of 10 worker threads and gather the results.
with futures.ThreadPoolExecutor(max_workers=10) as executor:
    fs = [
        executor.submit(fn, *fn_args)
        for fn, *fn_args in generate_fn("LLAMA3_INSTRUCT_70B")
    ]
    # as_completed yields futures as they finish, so results follow
    # completion order rather than submission order.
    code_strs = [done.result() for done in futures.as_completed(fs)]
    print(code_strs)