In [2]:
import os
import json

### Formatting the trainset output

In [3]:

# Prefix that marks a concrete search action, and the generic text that replaces it.
_SEARCH_ACTION_PREFIX = 'search['
_SEARCH_PLACEHOLDER = 'I should first generate a search action based on the instruction'


def _replace_leading_search_step(record):
    """Replace a leading 'search[...' step of ``record`` with the generic placeholder, in place.

    Anything that is not a dict holding a non-empty list under 'steps' is left
    untouched, so malformed records are skipped instead of raising.
    """
    if not isinstance(record, dict):
        return
    steps = record.get('steps')
    # Only a real list can be indexed by position; a dict or number here would raise.
    if not isinstance(steps, list) or not steps:
        return
    first_step = steps[0]
    if isinstance(first_step, str) and first_step.startswith(_SEARCH_ACTION_PREFIX):
        steps[0] = _SEARCH_PLACEHOLDER


def modify_json_file(input_file_path, output_file_path=None):
    """Rewrite the first step of every record in a JSON file and save the result.

    The file may contain either a list of records or a single record. For each
    record that is a dict with a 'steps' list whose first element is a string
    starting with 'search[', that first element is replaced by a fixed
    placeholder sentence. Everything else is written back unchanged.

    Args:
        input_file_path: Path of the JSON file to read.
        output_file_path: Path of the JSON file to write. When omitted, the
            input path is reused with '_modified_new' inserted before the
            file extension.

    Returns:
        None. The output path is reported with ``print``.

    Raises:
        OSError: If a file cannot be opened.
        json.JSONDecodeError: If the input file is not valid JSON.
    """
    # Derive "<name>_modified_new<ext>" when no explicit destination is given.
    if output_file_path is None:
        file_name, file_extension = os.path.splitext(input_file_path)
        output_file_path = f"{file_name}_modified_new{file_extension}"

    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(input_file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # Treat a single top-level record like a one-element list so both shapes
    # go through the same code path.
    records = data if isinstance(data, list) else [data]
    for record in records:
        _replace_leading_search_step(record)

    with open(output_file_path, 'w', encoding='utf-8') as file:
        json.dump(data, file, indent=2)

    print(f"Modified data has been saved to: {output_file_path}")

In [None]:
# Replace the placeholder path below with the JSON file produced by webshop_trainset_building.py
modify_json_file(input_file_path="json file from trainset")

In [None]:
# Replace the placeholder path with the modified data file written by the previous step
with open('the modified data', mode='r') as f:
    data_test = json.loads(f.read())



def transform_dict_to_list(input_dict):
    """
    Expand one record into a list of per-step samples in the training format.

    Args:
        input_dict: Mapping with the keys 'problem', 'steps' and 'models';
            'steps' and 'models' are walked in parallel.

    Returns:
        A list with one dict per (step, model) pair. Each dict carries
        'problemText', 'allSubtask' (every step, joined with "; "),
        'nowSubtask' (the current step) and 'difficultyNum' (0 when the model
        name contains 'llama3', otherwise 1).
    """
    task = input_dict['problem']
    step_list = input_dict['steps']
    model_list = input_dict['models']

    # Label every step once; the labels feed both the overview and each sample.
    labels = [f"step{number}: {text}" for number, text in enumerate(step_list, start=1)]
    overview = "; ".join(labels)

    # zip stops at the shorter sequence, so a step without a model yields no sample.
    return [
        {
            "problemText": task,
            "allSubtask": overview,
            "nowSubtask": label,
            "difficultyNum": 0 if 'llama3' in model_name else 1,
        }
        for label, model_name in zip(labels, model_list)
    ]

# Flatten: every record expands to several per-step samples, gathered into one list
transformed_data = [
    sample
    for item in data_test
    for sample in transform_dict_to_list(item)
]

# Write the flattened samples to disk as indented JSON
output_file_path = 'transformed_data.json'
with open(output_file_path, mode='w') as file:
    json.dump(transformed_data, file, indent=2)


### Converting the model allocation result to DOT input format
This step replaces the model assigned to each step in the test set with the reallocated model selection.

In [None]:
# Read the test set whose per-step model assignments will be rewritten
with open('webshop_succeed_finetune_final.json', mode='r') as f:
    data = json.load(f)

# Read the reallocation result (one entry per problem/step pair)
with open('Webshop_DOT_Allocation.json', mode='r') as f:
    reallocated_data = json.load(f)

# Build a two-level lookup: problem text -> step text -> model name.
# NOTE(review): steps are keyed by their text, so identical steps inside the
# same problem share one entry and the last allocation wins — confirm intended.
step_to_model = {}
for allocation in reallocated_data:
    problem_text = allocation['problemText']
    # Keep only what follows the first ': ' (presumably strips a "stepN" label — verify)
    step_text = allocation['nowSubtask'].split(': ', 1)[1]
    chosen_model = 'llama3-8b-8192' if allocation['difficultyNum'] == 0 else 'gpt-4o'
    step_to_model.setdefault(problem_text, {})[step_text] = chosen_model

# Swap in the reallocated model for every step that has one; records whose
# problem is absent from the lookup are left exactly as they are.
for record in data:
    per_step_models = step_to_model.get(record['problem'])
    if per_step_models is not None:
        record['models'] = [
            per_step_models.get(step_text, current_model)
            for step_text, current_model in zip(record['steps'], record['models'])
        ]

# Write the updated test set to a new JSON file
with open('updated_first_file.json', mode='w') as f:
    json.dump(data, f, indent=2)

print("Updated JSON file has been saved as 'updated_first_file.json'")