In [3]:
import yaml
import itertools
from copy import deepcopy


# Load the sweep definition. safe_load only builds plain Python objects
# (dicts, lists, scalars), so the YAML file cannot execute arbitrary code.
# Later cells read the 'search_space' key of the resulting mapping.
with open('sweep_definition.yaml', 'r') as file:
    sweep_config = yaml.safe_load(file)

In [4]:

def flatten_dict(variant_dict):
    """Flatten a ``{variant_name: settings}`` mapping into a single dict.

    Args:
        variant_dict: Mapping whose first key is the variant name and whose
            value is a mapping of settings for that variant. The value may be
            ``None`` (a variant declared without any settings).

    Returns:
        A new dict with ``'variant_name'`` as its first key, followed by the
        variant's settings. A setting that is itself called
        ``'variant_name'`` overrides the name taken from the key.

    Raises:
        IndexError: If ``variant_dict`` is empty.
    """
    variant_name = list(variant_dict)[0]
    variant_values = variant_dict[variant_name]
    result_dict = {'variant_name': variant_name}
    # A variant with no settings is loaded as None; dict.update(None) would
    # raise TypeError, so only merge when there is something to merge.
    if variant_values is not None:
        result_dict.update(variant_values)
    return result_dict

def cartesian_product(*lists):
    """Merge one dict from each input list, for every possible combination.

    Args:
        *lists: Any number of iterables of dicts.

    Returns:
        A list with one merged dict per element of
        ``itertools.product(*lists)``. Within a combination, dicts are merged
        left to right, so a key present in several of them takes the value
        from the right-most one. Called with no arguments, the result is
        ``[{}]``; if any input list is empty, the result is ``[]``.
    """
    def _merge(parts):
        # Fold the chosen dicts into a single fresh dict, later ones winning.
        combined = {}
        for part in parts:
            combined.update(part)
        return combined

    return [_merge(combo) for combo in itertools.product(*lists)]

def duplicate_dict(dictionary):
    """Expand every list-valued entry into one dict per list element.

    Args:
        dictionary: Mapping whose values may be lists of alternatives.

    Returns:
        A list of dicts in which each list-valued key of ``dictionary`` has
        been replaced by a single element of its list, covering every
        combination of elements. Keys are processed in the mapping's own
        order; for each key the list element is the outer loop and the
        previously built dicts the inner loop. If no value is a list, the
        result is ``[dictionary]`` (the same object, not a copy). An empty
        list value yields an empty result.
    """
    def _with_value(base, field, choice):
        # Shallow-copy so siblings built from the same base stay independent.
        clone = base.copy()
        clone[field] = choice
        return clone

    expanded = [dictionary]
    for field, choices in dictionary.items():
        if not isinstance(choices, list):
            continue
        expanded = [
            _with_value(partial, field, choice)
            for choice in choices
            for partial in expanded
        ]
    return expanded



def _expand_variant(variant_settings):
    """Expand one flattened variant into every concrete settings dict.

    Non-dict values are treated as fixed settings. A dict value is treated as
    a swept parameter: it is expanded with ``duplicate_dict`` and the
    ``'values'`` entry of each expansion becomes one candidate for that key.
    All swept keys are crossed with each other, so a variant with no swept
    keys yields exactly one dict (its fixed settings).
    """
    fixed = {key: value for key, value in variant_settings.items()
             if not isinstance(value, dict)}

    # Collect the candidates of *every* swept key. Previously the candidate
    # list was reset on each key, so only the last key of the variant could
    # be swept and a swept key followed by a fixed one was silently dropped.
    swept = {}
    for key, value in variant_settings.items():
        if isinstance(value, dict):
            choices = [option.get('values') for option in duplicate_dict(value)]
            if choices:  # a sweep with no candidates leaves the key out
                swept[key] = choices

    expanded = []
    # product() over zero swept keys yields one empty combination, which
    # covers the "nothing to sweep" case; a single candidate is kept as well
    # (it used to be discarded by a ``len(...) > 1`` check).
    for combo in itertools.product(*swept.values()):
        settings = deepcopy(fixed)
        settings.update(zip(swept.keys(), combo))
        expanded.append(settings)
    return expanded


# Build one list per swept node; each list holds ``{node: settings}`` dicts,
# one per concrete variant, ready to be crossed with the other nodes' lists.
search_space = []
for node, node_values in sweep_config.get('search_space').items():
    if node not in ('llm', 'embedding', 'vector_store'):
        continue
    grid_final = []
    for variant in node_values:
        for settings in _expand_variant(flatten_dict(variant)):
            grid_final.append({node: settings})
    search_space.append(grid_final)

In [5]:
# Cross the per-node variant lists: each resulting step is a single dict that
# maps every node present in search_space to one chosen variant's settings.
grid_search_steps = cartesian_product(*search_space)

In [36]:
# Inspect one generated step as a sanity check. Index 3 is just a sample and
# raises IndexError when fewer than four steps were generated.
grid_search_steps[3]

{'llm': {'variant_name': 'mistral',
  'family': 'MISTRAL',
  'connection': 'mistral-large-maas'},
 'embedding': {'variant_name': 'aoai',
  'family': 'AZUREOPENAI',
  'connection': 'aoai',
  'deployment': 'text-embedding-ada-002'}}

In [9]:
import json
import subprocess

# Load the cookiecutter context template; `data` is the base dict that
# create_cookiecutter() copies and specialises for each grid step.
file_path = "./rag_flow_template/cookiecutter_template.json"
with open(file_path, "r") as file:
    data = json.load(file)



In [29]:
from unique_names_generator import get_random_name
import random
import string


def generate_step_id():
    """Return a readable identifier of the form ``<random_name>_<suffix>``.

    The name part comes from ``get_random_name`` (lowercase, underscore
    separated); the suffix is 8 characters drawn with replacement from
    lowercase ASCII letters and digits using the ``random`` module.
    """
    readable = get_random_name(separator="_", style="lowercase")
    alphabet = string.ascii_lowercase + string.digits
    suffix = ''.join(random.choices(alphabet, k=8))
    return '_'.join((readable, suffix))

# Generate one sample ID and print it to check the format
# (<random_name>_<8 lowercase letters/digits>).
unique_id = generate_step_id()
print(unique_id)


def create_cookiecutter(data, grid_step):
    """Build the cookiecutter context for one grid-search step.

    Args:
        data: Template context (mapping). It is deep-copied and never
            modified.
        grid_step: Mapping of node name to the variant settings chosen for
            this step.

    Returns:
        A copy of ``data`` in which each of the ``'llm'``, ``'embedding'`` and
        ``'vector_store'`` entries is replaced by the matching entry of
        ``grid_step``, or removed when ``grid_step`` has no entry for it. All
        other keys are left as they are in the template.
    """
    sweep_nodes = ('llm', 'embedding', 'vector_store')
    data_dict = deepcopy(data)
    # Iterate over a snapshot of the keys: deleting from a dict while
    # iterating it directly raises "dictionary changed size during iteration".
    for node_id in list(data_dict):
        if node_id not in sweep_nodes:
            continue
        if node_id in grid_step:
            data_dict[node_id] = grid_step[node_id]
        else:
            del data_dict[node_id]
    return data_dict

blush_fly_z3ktl6dy


In [35]:
import os
# cookiecutter reads its context from this file inside the template folder,
# so it is rewritten before every generation.
file_path = "./rag_flow_template/cookiecutter.json"

# Output folder for the generated flows; create it once, up front.
directory = 'flow_versions'
os.makedirs(directory, exist_ok=True)

for i, search_step_dict in enumerate(grid_search_steps):
    step_dict = create_cookiecutter(data, search_step_dict)
    step_dict["flow_name"] = f"grid_step_{i}"

    with open(file_path, "w") as file:
        json.dump(step_dict, file, indent=4)

    # Run cookiecutter from inside the output folder (hence the "../" path to
    # the template). An argument list with cwd= avoids going through a shell,
    # and check=True surfaces a failed generation here instead of later, when
    # the flow folder turns out to be missing.
    subprocess.run(
        ["cookiecutter", "../rag_flow_template", "--no-input", "--skip-if-file-exists"],
        cwd=directory,
        check=True,
    )


In [1]:
from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential
from promptflow.azure import PFClient


# Authenticate with whatever Azure credential DefaultAzureCredential can find
# in the current environment.
credential = DefaultAzureCredential()
  
# Get a handle to the workspace; from_config reads config.json from the
# current directory or one of its parents. `pf` is used by
# create_and_run_flow() to submit runs.
pf = PFClient.from_config(
    credential=credential,
)

Found the config file in: /home/vhoudebine/config.json


In [27]:
import os

def create_and_run_flow(i):
    """Submit the flow generated for grid step ``i`` and return the run.

    The flow folder is ``./flow_versions/rag_flow_grid_step_<i>``; it is run
    against ``./evaluation_data/data.jsonl`` with the flow's ``question``
    input mapped to the ``question`` column of the data. Uses the
    module-level ``pf`` client and prints a line before and after submission.

    Args:
        i: Index of the grid step whose flow should be run.

    Returns:
        The run object returned by ``pf.run``.
    """
    flow_dir = os.path.join("./flow_versions", f"rag_flow_grid_step_{i}")
    print(f"Creating run for {flow_dir}")
    submitted = pf.run(
        flow=flow_dir,
        data="./evaluation_data/data.jsonl",
        column_mapping={"question": "${data.question}"},
    )
    print(f"Created run {submitted.name}")
    return submitted


# Submit one run per grid step, sequentially (no threads are created here).
# Only the index is used: it selects the flow folder generated for that step.
runs = []
for i, search_step_dict in enumerate(grid_search_steps):
    run = create_and_run_flow(i)
    runs.append(run)

Creating run for ./flow_versions/rag_flow_grid_step_0


[32mUploading rag_flow_grid_step_0 (0.0 MBs): 100%|██████████| 514/514 [00:00<00:00, 4185.59it/s]
[39m



Portal url: https://ml.azure.com/runs/rag_flow_grid_step_0_variant_0_20240404_200429_297972?wsid=/subscriptions/6c065ea7-65cd-4a34-8e2a-3e21ad4a8e9f/resourcegroups/vince-rg/providers/Microsoft.MachineLearningServices/workspaces/vince-dev
Created run rag_flow_grid_step_0_variant_0_20240404_200429_297972
Creating run for ./flow_versions/rag_flow_grid_step_1




Portal url: https://ml.azure.com/runs/rag_flow_grid_step_1_variant_0_20240404_200441_308304?wsid=/subscriptions/6c065ea7-65cd-4a34-8e2a-3e21ad4a8e9f/resourcegroups/vince-rg/providers/Microsoft.MachineLearningServices/workspaces/vince-dev
Created run rag_flow_grid_step_1_variant_0_20240404_200441_308304


In [28]:
# Keep a handle on the first submitted run (the one for grid step 0).
run = runs[0]