In [1]:
from pathlib import Path
import numpy as np
import pandas as pd
import random
import copy
import ast
import pprint
import json
import re
import time
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
def set_seed():
    random.seed(0)
    np.random.seed(0)

In [3]:
GENERATE_METHODS_DIR = Path('data/docs/manual')
METHODS_DIR = Path('data/docs/methods')
PROMPT_SEEDS_DIR = Path('data/prompts/generation/components')
PROMPT_COMPONENTS_DIR = Path('data/prompts/generation/components')
GEN_PROMPTS_DIR = Path('data/prompts/generation/output')
VAL_PROMPTS_DIR = Path('data/prompts/validation/output')
DATA_DIR = Path('data/')

# device types & locations

In [4]:
DEVICES = list(pd.read_csv(DATA_DIR/'datasets/partial/device_types.csv')['type'])
DEVICES

['Cover', 'Switch', 'Light', 'Input', 'Temperature', 'Smoke', 'Humidity']

In [5]:
DEVICE_LOCATIONS = pd.read_csv(DATA_DIR/'datasets/partial/device_locations.csv')
DEVICE_LOCATIONS

Unnamed: 0,device,location
0,Cover,Bedroom
1,Cover,Living room
2,Cover,Kitchen
3,Cover,Bathroom
4,Cover,Patio
...,...,...
65,Humidity,Basement
66,Humidity,Laundry room
67,Humidity,Garage
68,Humidity,Attic


## EDA

In [28]:
df = pd.read_csv(DATA_DIR / 'docs/methods_json.csv')
df.head()

Unnamed: 0,method,json
0,Input.GetConfig,"{""method"": ""Input.GetConfig"", ""params"": {""type..."
1,Switch.SetConfig,"{""method"": ""Switch.SetConfig"", ""params"": {""typ..."
2,Smoke.GetConfig,"{""method"": ""Smoke.GetConfig"", ""params"": {""type..."
3,Light.ResetCounters,"{""method"": ""Light.ResetCounters"", ""params"": {""..."
4,Cover.GetConfig,"{""method"": ""Cover.GetConfig"", ""params"": {""type..."


In [31]:
def count_parameters(scheme):
    properties = scheme['properties']
    res = len(properties.keys())
    for key, val in properties.items():
        if 'type' in val and val['type'] == 'object':
            res += count_parameters(val)
    return res

df['num_props'] = 0
for i, row in df.iterrows():
    scheme = json.loads(row['json'])['params']
    num_properties = count_parameters(scheme)
    df.loc[i, 'num_props'] = num_properties

df.head()

Unnamed: 0,method,json,num_props
0,Input.GetConfig,"{""method"": ""Input.GetConfig"", ""params"": {""type...",1
1,Switch.SetConfig,"{""method"": ""Switch.SetConfig"", ""params"": {""typ...",16
2,Smoke.GetConfig,"{""method"": ""Smoke.GetConfig"", ""params"": {""type...",1
3,Light.ResetCounters,"{""method"": ""Light.ResetCounters"", ""params"": {""...",2
4,Cover.GetConfig,"{""method"": ""Cover.GetConfig"", ""params"": {""type...",1


In [39]:
adv_df = df[df['method'].str.contains('SetConfig')]
display(adv_df[['method', 'num_props']].sort_values(by='num_props', ascending=False))

Unnamed: 0,method,num_props
25,Cover.SetConfig,28
22,Light.SetConfig,24
27,Input.SetConfig,23
1,Switch.SetConfig,16
6,Humidity.SetConfig,6
29,Temperature.SetConfig,6
37,Smoke.SetConfig,4


In [42]:
basic_df = df[~df['method'].str.contains('SetConfig') & ~df['method'].str.contains('GetConfig') & ~df['method'].str.contains('GetStatus')]
display(basic_df[['method', 'num_props']].sort_values(by='num_props', ascending=False))

Unnamed: 0,method,num_props
36,Light.Set,5
18,Switch.Set,4
19,Cover.GoToPosition,3
8,Cover.Close,2
9,Switch.ResetCounters,2
11,Humidity.SetHumidity,2
13,Input.ResetCounters,2
16,Cover.Open,2
24,Temperature.SetTemperature,2
26,Cover.ResetCounters,2


# seeds generation

In [13]:
target_device = devices[2]

with open('prompts/prompt.md', "w") as f:
    f.write(f'')

with open('prompts/instruction_onedevice.md', "r") as f:
    instruction = f.read()
with open('prompts/seeds_multiapi.md', "r") as f:
    seeds = f.read()

with open('prompts/prompt.md', "a") as f:
    f.write(f'{instruction}\n\n'
            f'Example:\n\n{seeds}\n\n'
            f'For generation: \n\n'
            f'Device: {target_device} id=XXX\n\n')

row_i = 0
for p in Path('docs/').iterdir():
    if target_device in p.name and p.name != f'{target_device}.md':
        with open(p, "r") as f:
            method_str = f.read()
        with open('prompts/prompt.md', "a") as f:
            f.write(f'API method {row_i}: {method_str}\n')
        row_i += 1
    if row_i == 5:
        break

# with open('prompts/prompt.md', "a") as f:
#     f.write(f'User command 0:')

In [49]:
output = """[{"User command 0.1": "Reset the energy counter for the main light.", "JSON command 0.1": {"method": "Light.ResetCounters", "params": {"id": "id", "type": ["energy"]}}}, {"User command 0.2": "Reset the all counter for the light, please.", "JSON command 0.2": {"method": "Light.ResetCounters", "params": {"id": "id"}}}, {"User command 1.1" : "Could you please fetch the configuration details for the living room light, including its name, night mode settings, and transition duration?", "JSON command 1.1": {"method": "Light.GetConfig", "params": {"id": "id"}}}, {"User command 1.2" : "Tell me, what is transition duration for the main light?", "JSON command 1.2": {"method": "Light.GetConfig", "params": {"id": "id"}}}]"""
output = output.replace("XXX", '"id"')

In [50]:
output = re.sub(r'(User command \d\.\d)', 'User command', output)
output = re.sub(r'(JSON command \d\.\d)', 'JSON command', output)

In [51]:
print(output)

[{"User command": "Reset the energy counter for the main light.", "JSON command": {"method": "Light.ResetCounters", "params": {"id": "id", "type": ["energy"]}}}, {"User command": "Reset the all counter for the light, please.", "JSON command": {"method": "Light.ResetCounters", "params": {"id": "id"}}}, {"User command" : "Could you please fetch the configuration details for the living room light, including its name, night mode settings, and transition duration?", "JSON command": {"method": "Light.GetConfig", "params": {"id": "id"}}}, {"User command" : "Tell me, what is transition duration for the main light?", "JSON command": {"method": "Light.GetConfig", "params": {"id": "id"}}}]


In [52]:
import json

In [53]:
cmds = json.loads(output)

In [54]:
json.dumps(cmds)

'[{"User command": "Reset the energy counter for the main light.", "JSON command": {"method": "Light.ResetCounters", "params": {"id": "id", "type": ["energy"]}}}, {"User command": "Reset the all counter for the light, please.", "JSON command": {"method": "Light.ResetCounters", "params": {"id": "id"}}}, {"User command": "Could you please fetch the configuration details for the living room light, including its name, night mode settings, and transition duration?", "JSON command": {"method": "Light.GetConfig", "params": {"id": "id"}}}, {"User command": "Tell me, what is transition duration for the main light?", "JSON command": {"method": "Light.GetConfig", "params": {"id": "id"}}}]'

In [55]:
df_dict = {'device': [], 'user_cmd': [], 'used_mtd': [], 'json_cmd': []}
for row_i in range(len(cmds)):
    df_dict['user_cmd'].append(cmds[row_i][f'User command'])
    df_dict['json_cmd'].append(cmds[row_i][f'JSON command'])
    df_dict['used_mtd'].append(f'{cmds[row_i][f"JSON command"]["method"]}')
    df_dict['device'].append(target_device)
print(df_dict)

{'device': ['Light', 'Light', 'Light', 'Light'], 'user_cmd': ['Reset the energy counter for the main light.', 'Reset the all counter for the light, please.', 'Could you please fetch the configuration details for the living room light, including its name, night mode settings, and transition duration?', 'Tell me, what is transition duration for the main light?'], 'used_mtd': ['Light.ResetCounters', 'Light.ResetCounters', 'Light.GetConfig', 'Light.GetConfig'], 'json_cmd': [{'method': 'Light.ResetCounters', 'params': {'id': 'id', 'type': ['energy']}}, {'method': 'Light.ResetCounters', 'params': {'id': 'id'}}, {'method': 'Light.GetConfig', 'params': {'id': 'id'}}, {'method': 'Light.GetConfig', 'params': {'id': 'id'}}]}


In [56]:
data_path = Path('data/')

In [57]:

df = pd.DataFrame(df_dict)
df.to_csv(data_path / f'prompt_seeds.csv', index=False, mode='a', header=False)

# prompt generation

In [47]:
import pprint

In [48]:
device_methods_dict = {device: {'basic': [], 'advance': []} for device in DEVICES}

advance_methods = ['GetConfig', 'SetConfig', 'GetStatus']
for p in METHODS_DIR.iterdir():
    for d in DEVICES:
        if d in p.name:
            if p.stem.split('.')[1] in advance_methods:
                # device_methods_dict[d]['advance'].append(p.stem)
                continue
            else:
                device_methods_dict[d]['basic'].append(p.stem)

# device_methods_dict['Switch']['basic'].append('Switch.Toggle')
# device_methods_dict['Light']['basic'].append('Light.ResetCounters')

# device_methods_dict['Temperature']['advance'].append('Temperature.GetStatus')
# device_methods_dict['Switch']['advance'].append('Switch.GetStatus')
# device_methods_dict['Smoke']['advance'].append('Smoke.SetConfig')
# device_methods_dict['Temperature']['advance'].append('Temperature.SetConfig')
# device_methods_dict['Switch']['advance'].append('Switch.GetConfig')
# device_methods_dict['Cover']['advance'].append('Cover.SetConfig')

pprint.pprint(device_methods_dict)

{'Cover': {'advance': [],
           'basic': ['Cover.Close',
                     'Cover.Open',
                     'Cover.GoToPosition',
                     'Cover.Calibrate',
                     'Cover.ResetCounters',
                     'Cover.Stop']},
 'Humidity': {'advance': [], 'basic': ['Humidity.SetHumidity']},
 'Input': {'advance': [], 'basic': ['Input.ResetCounters']},
 'Light': {'advance': [],
           'basic': ['Light.ResetCounters',
                     'Light.Toggle',
                     'Light.Calibrate',
                     'Light.Set']},
 'Smoke': {'advance': [], 'basic': ['Smoke.Mute']},
 'Switch': {'advance': [],
            'basic': ['Switch.ResetCounters', 'Switch.Set', 'Switch.Toggle']},
 'Temperature': {'advance': [], 'basic': ['Temperature.SetTemperature']}}


In [49]:
def generate_seed(seeds_df, num_methods=2, num_examples=2):
    rand_device = random.choice(list(seeds_df['device'].unique()))
    seed_methods = random.sample(list(seeds_df[seeds_df['device'] == rand_device]['used_mtd'].unique()), num_methods)
    methods_description = ''
    cmds = {}
    for method_i, method in enumerate(seed_methods):
        with open(METHODS_DIR / f'{method}.md', "r") as f:
            method_str = f.read()
        methods_description += f'API method {method_i+1}: {method_str}\n'
        example_ids = random.sample(list(seeds_df[seeds_df['used_mtd'] == method].index), num_examples)
        for example_i, row_i in enumerate(example_ids):
            cmds[f'Example {method_i+1}.{example_i+1}'] = {f'User command': seeds_df.loc[row_i, 'user_cmd'],
                 f'JSON command': ast.literal_eval(seeds_df.loc[row_i, 'json_cmd'].replace("'", '"'))}
    seed_description = (
        f'The next device, methods and JSON are example:\n\n'
        f'Device: {rand_device} id=33\n\n'
        f'{methods_description}'
        f'{json.dumps({"commands": cmds})}'
    )
    return seed_description

def generate_advance_seed(seeds_df):
    rand_device = random.choice(list(seeds_df['device'].unique()))
    method = random.choice(list(seeds_df[seeds_df['device'] == rand_device]['used_mtd'].unique()))
    cmds = {}
    with open(METHODS_DIR / f'{method}.md', "r") as f:
        method_str = f.read()
    method_description = f'API method 1:\n{method_str}\n'
    sample_method_ids = list(seeds_df[seeds_df['used_mtd'] == method].index)
    example_ids = random.sample(sample_method_ids, min(4, len(sample_method_ids)))
    for example_i, row_i in enumerate(example_ids):
        cmds[f'Example {example_i+1}'] = {f'User command': seeds_df.loc[row_i, 'user_cmd'],
            f'JSON command': ast.literal_eval(seeds_df.loc[row_i, 'json_cmd'].replace("'", '"'))}
    seed_description = (
        f'The next device, methods and JSON are example:\n\n'
        f'Device: {rand_device} id=33\n\n'
        f'{method_description}'
        f'{json.dumps({"commands": cmds})}'
    )
    return seed_description

In [50]:
seeds_df = pd.read_csv(PROMPT_SEEDS_DIR / 'prompt_seeds.csv')
# json.loads(seeds_df.loc[0, 'json_cmd'].replace("'", '"'))['method']
# f"dict: {json.dumps(ast.literal_eval(seeds_df.loc[0, 'json_cmd']))}"
print(generate_seed(seeds_df, num_examples=3))

The next device, methods and JSON are example:

Device: Cover id=33

API method 1: Method name: Cover.Close
Method description:
Properties:
{"id": {"type": "number", "description": "The numeric ID of the Cover component instance"}, "duration": {"type": "number", "description": "If duration is not provided, Cover will fully close, unless it times out because of maxtime_close first. If duration (seconds) is provided, Cover will move in the close direction for the specified time. duration must be in the range [0.1..maxtime_open]Optional"}}
Response:
null on success; error if the request can not be executed or failed

API method 2: Method name: Cover.Open
Method description: 
Properties:
{"id": {"type": "number", "description": "The numeric ID of the Cover component instance"}, "duration": {"type": "number", "description": "If duration is not provided, Cover will fully open, unless it times out because of maxtime_open first. If duration (seconds) is provided, Cover will move in the open di

In [51]:
NUM_METHODS_GEN = 2
NUM_METHODS_SEED = NUM_METHODS_GEN
NUM_EXAMPLES_SEED = 3
EXAMPLES_BASIC_METHOD = NUM_EXAMPLES_SEED
EXAMPLES_ADVANCE_METHOD = 8
REPEAT_TIMES = 3

In [54]:
seeds_df = pd.read_csv(PROMPT_SEEDS_DIR / 'prompt_seeds.csv')

gen_prompts_df = pd.DataFrame(columns=['type', 'device', 'last_mtd_i', 'text'])
for device, methods in device_methods_dict.items():
    for _ in range(REPEAT_TIMES):
        method_i = 0
        methods_description = ''

        for i, method_name in enumerate(methods['basic']):
            with open(METHODS_DIR / f'{method_name}.md', "r") as f:
                method_str = f.read()
            methods_description += f'API method {method_i+1}: {method_str}\n'
            method_i += 1

            if (method_i % NUM_METHODS_GEN == 0 and method_i != 0) or i == len(methods['basic']) - 1:
                with open(PROMPT_COMPONENTS_DIR / 'instruction_onedevice.md', "r") as f:
                    instruction = f.read()
                instruction = instruction.replace('{NUM_EXAMPLE_COMMANDS}', str(method_i*EXAMPLES_BASIC_METHOD))
                instruction = instruction.replace('{EXAMPLES_BASIC_METHOD}', str(EXAMPLES_BASIC_METHOD))

                seed_1 = generate_seed(seeds_df, NUM_METHODS_SEED, NUM_EXAMPLES_SEED)
                prompt_text = (f'{instruction}\n\n'
                            f'{seed_1}\n\n'
                            f'The next device and methods are for you to generate commands:\n\n'
                            f'Device: {device} id=444\n\n'
                            f'{methods_description}')
                gen_prompts_df.loc[len(gen_prompts_df)] = pd.Series({'type': 'basic',
            'device': device,
            'last_mtd_i': i,
            'text': prompt_text})

                method_i = 0
                methods_description = ''
        
        for i, method_name in enumerate(methods['advance']):
            with open(METHODS_DIR / f'{method_name}.md', "r") as f:
                method_str = f.read()
            method_description = f'API method 1:\n{method_str}\n'

            with open(PROMPT_COMPONENTS_DIR / 'instruction_advance.md', "r") as f:
                instruction = f.read()
            instruction = instruction.replace('{NUM_EXAMPLE_COMMANDS}', str(EXAMPLES_ADVANCE_METHOD))

            seed_1 = generate_advance_seed(seeds_df)
            
            prompt_text = (f'{instruction}\n\n'
                        f'{seed_1}\n\n'
                        f'The next device and method are for you to generate commands:\n\n'
                        f'Device: {device} id=444\n\n'
                        f'{method_description}')
            gen_prompts_df.loc[len(gen_prompts_df)] = pd.Series({'type': 'advance',
            'device': device,
            'last_mtd_i': i,
            'text': prompt_text})

gen_prompts_df.to_csv(GEN_PROMPTS_DIR / 'prompts_gen.csv', index=False)

In [55]:
df = pd.read_csv(GEN_PROMPTS_DIR / 'prompts_gen.csv')
df[(df['last_mtd_i'] == 1) & (df['device'] == 'Cover')]

Unnamed: 0,type,device,last_mtd_i,text
0,basic,Cover,1,You are provided with a device in a smart home...
3,basic,Cover,1,You are provided with a device in a smart home...
6,basic,Cover,1,You are provided with a device in a smart home...


# inspection

In [21]:
def parse_output(output):
    output = json.loads(output)['commands']
    df_dict = {'device': [], 'user_cmd': [], 'mtd': [], 'json_cmd': []}
    for cmds in list(output.values()):
        df_dict['user_cmd'].append(cmds[f'User command'])
        df_dict['json_cmd'].append(json.dumps(cmds[f'JSON command']))
        df_dict['mtd'].append(f'{cmds[f"JSON command"]["method"]}')
        df_dict['device'].append(f'{cmds[f"JSON command"]["method"].split(".")[0]}')
    df = pd.DataFrame(df_dict)
    header = not (DATA_DIR / 'prompts/generation/output/prompts_val.csv').exists()
    df.to_csv(DATA_DIR / 'prompts/generation/output/prompts_val.csv', index=False, mode='a', header=header)

# Dataset generation

In [19]:
from openai import OpenAI
with open('data/keys/openai.txt') as f:
    key = f.read()
client = OpenAI(api_key=key)

In [22]:
gen_prompts_df = pd.read_csv(GEN_PROMPTS_DIR / 'prompts_gen.csv')

parsed = []
for _, r in gen_prompts_df.iterrows():
    prompt = r['text']

    print(prompt)
    print('---------------------------------------')

    completion = client.chat.completions.create(
        model="gpt-3.5-turbo-0125",
        response_format={"type": "json_object"},
        messages=[
            {"role": "user", "content": prompt}
        ]
    )
    parse_output(completion.choices[0].message.content)

    print(f'{r["device"]}:{r["last_mtd_i"]} parsed!\n\n')

    print(f'{completion.choices[0].message.content}\n\n')
    print('---------------------------------------')

    parsed.append(f'{r["device"]}:{r["last_mtd_i"]}')

    time.sleep(10)

You are provided with a device in a smart home, its API methods, their descriptions and required parameters. Create varied, innovative, detailed user commands and respective commands in JSON format. User commands must sound naturally. Don't add any mentions of the place where device is located. Also, replace phrase that mentions device in the user command with a string {DEVICE_NAME}. Command to device must be strictly in JSON format. Generate exactly 3 example commands. For each API method generate 3 commands that use this method. You mustn't hallucinate new API methods. In each pair of user command with JSON command use at least half of possible parametrs. Ask about parameters from reponse output if they exist. Don't add to JSON command parameters that are not mentioned in the user command. Output must be strictly JSON array: {"commands": {"Example 1.1": {"User command": "...", "JSON command": {...}}, "Example 1.2": {"User command": "...", "JSON command": {...}}, "Example 1.3": {"User

# Filter methods parameters

In [6]:
a = {0: {1: 1}}
b = a[0]
b[1] = 2
print(a, b)

{0: {1: 2}} {1: 2}


In [7]:
def remove_hallucinated_parameters(gt_json, json_scheme):
    json_temp = gt_json.copy()
    for key, val in json_temp.items():
        if key not in json_scheme:
            gt_json.pop(key, None)
            print(f'hallucinated: {key}')
            continue
        if isinstance(val, dict):
            remove_hallucinated_parameters(val, json_scheme[key]['properties'])

In [8]:
gt_json = {0: {1: 1}, 1: {1: 1}}
json_scheme = {1: {'properties': {1: 1}}}
remove_hallucinated_parameters(gt_json, json_scheme)
gt_json

hallucinated: 0


{1: {1: 1}}

In [10]:
json_schemes_df = pd.read_csv(METHODS_DIR.parent / 'methods_json.csv')
df = pd.read_csv(DATA_DIR / 'prompts/generation/output/prompts_val.csv')

for i, row in df.iterrows():
    method_name = row['mtd']
    method_df = json_schemes_df[json_schemes_df['method'] == method_name]
    if method_df.shape[0] == 0:
        print(f'Incorrect method name in: {i}')
        continue
    json_scheme = json.loads(method_df.iloc[0]['json'])
    print(json_scheme)
    json_cmd = json.loads(row['json_cmd'])
    print(json_cmd)
    remove_hallucinated_parameters(json_cmd, json_scheme)
    print('<<<---------------------------->>>')    
    row['json_cmd'] = json.dumps(json_cmd)

df.to_csv(DATA_DIR / 'prompts/generation/output/prompts_filtered.csv', index=False)

{'method': 'Cover.Close', 'params': {'type': 'object', 'properties': {'id': {'type': 'number', 'description': 'The numeric ID of the Cover component instance'}, 'duration': {'type': 'number', 'description': 'If duration is not provided, Cover will fully close, unless it times out because of maxtime_close first. If duration (seconds) is provided, Cover will move in the close direction for the specified time. duration must be in the range [0.1..maxtime_open]Optional'}}}}
{'method': 'Cover.Close', 'params': {'id': 444}}
<<<---------------------------->>>
{'method': 'Cover.Close', 'params': {'type': 'object', 'properties': {'id': {'type': 'number', 'description': 'The numeric ID of the Cover component instance'}, 'duration': {'type': 'number', 'description': 'If duration is not provided, Cover will fully close, unless it times out because of maxtime_close first. If duration (seconds) is provided, Cover will move in the close direction for the specified time. duration must be in the range [

## EDA

In [None]:
df = pd.read_csv(DATA_DIR / 'datasets/partial/templated_actions.csv')
df.head()

Unnamed: 0,device,user_cmd,mtd,json_cmd
0,Switch,Set the {DEVICE_NAME} to flip mode.,Switch.SetConfig,"{""method"": ""Switch.SetConfig"", ""params"": {""id""..."
1,Switch,Set the {DEVICE_NAME} to cycle mode.,Switch.SetConfig,"{""method"": ""Switch.SetConfig"", ""params"": {""id""..."
2,Switch,Enable Automatic OFF for the kitchen {DEVICE_N...,Switch.SetConfig,"{""method"": ""Switch.SetConfig"", ""params"": {""id""..."
3,Switch,Set {DEVICE_NAME} name to 'Bed Lamp'.,Switch.SetConfig,"{""method"": ""Switch.SetConfig"", ""params"": {""id""..."
4,Switch,Set the hallway {DEVICE_NAME} power limit to 5...,Switch.SetConfig,"{""method"": ""Switch.SetConfig"", ""params"": {""id""..."


In [None]:
df.groupby('mtd').count()

Unnamed: 0_level_0,device,user_cmd,json_cmd
mtd,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Cover.Calibrate,6,6,6
Cover.Close,6,6,6
Cover.GetConfig,16,16,16
Cover.GetStatus,16,16,16
Cover.GoToPosition,6,6,6
Cover.Open,5,5,5
Cover.ResetCounters,6,6,6
Cover.SetConfig,10,10,10
Cover.Stop,6,6,6
Humidity.GetConfig,12,12,12


# Add DEVICE_NAME

In [4]:
from openai import OpenAI
with open('data/keys/openai.txt') as f:
    key = f.read()
client = OpenAI(api_key=key)

In [5]:
def parse_output(output, rows, output_path):
    output = json.loads(output)
    df_dict = {'id': [], 'device': [], 'user_cmd': [], 'mtd': [], 'json_cmd': []}
    for i in range(len(rows)):
        df_dict['user_cmd'].append(output[f'Edited command {i+1}'])
        df_dict['json_cmd'].append(rows[i]['json_cmd'])
        df_dict['mtd'].append(rows[i]['mtd'])
        df_dict['device'].append(rows[i]['device'])
        df_dict['id'].append(rows[i]['id'])
    df = pd.DataFrame(df_dict)
    header = not (output_path).exists()
    df.to_csv(output_path, index=False, mode='a', header=header)

In [6]:
df = pd.read_csv(DATASET_PATH)

with open(PROMPT_COMPONENTS_DIR / 'instruction_device_name.md') as f:
    instruction = f.read()

output_path = Path('data/datasets/dataset_v6.csv')

devices = []
commands = []
rows = []
num_send_commands = 6
for i in range(60, df.shape[0]): # df.shape[0]
    device = df.iloc[i]['device']
    user_cmd = df.iloc[i]['user_cmd']

    devices.append(device)
    commands.append(user_cmd)
    rows.append(df.iloc[i])

    if (len(devices) % 6 == 0 and len(devices) != 0) or i == df.shape[0] - 1:
        prompt = [f'Device type {i+1}: {device_type}\nUser command {i+1}: {command}'
                  for i, (device_type, command) in enumerate(zip(devices, commands))]
        prompt = '\n\n'.join(prompt)
        prompt = (f'{instruction}\n\n'
                  f'{prompt}')

        print(prompt)
        print('---------------------------------------')

        completion = client.chat.completions.create(
            model="gpt-3.5-turbo-0125",
            response_format={"type": "json_object"},
            messages=[
                {"role": "user", "content": prompt}
            ]
        )
        parse_output(completion.choices[0].message.content, rows, output_path)

        print(f'{completion.choices[0].message.content}\n\n')
        print('---------------------------------------')

        devices = []
        commands = []
        rows = []

        time.sleep(10)

You are provided with the device type and the user command. Firstly, remove any mentions of the place where device is located. Then, replace phrase that mentions device in the user command with a string {{DEVICE_NAME}}. Output must be strictly JSON object: {"Edited command 1": "...", "Edited command 2": "...", "Edited command 3": "..."}. Don't output anything else. There is an example below.

Device type 1: Humidity
User command 1: Update me on the humidity in the lounge.

Device type 2: Smoke
User command 2: Set the name of the smoke sensor to 'Office' now.

Device type 3: Temperature
User command 3: Set the temperature report threshold of the kitchen thermometer to 1C.

Device type 4: Input
User command 4: Tell me the status of the input in the guest room, please.

Device type 5: Light
User command 5: Disable auto on function for the office light.

Device type 6: Switch
User command 6: Check the instantaneous active power delivered by the switch in the bedroom.

{"Edited command 1": 

## Locations

In [36]:
# locations = {"Cover": ["Bedroom", "Living room", "Kitchen", "Bathroom", "Patio", "Garage", "Office", "Basement", "Attic", "Outdoor"], "Switch": ["Living room", "Bedroom", "Kitchen", "Dining area", "Bathroom", "Garage", "Outdoor", "Home office", "Hallway", "Entryway"], "Light": ["Living room", "Bedroom", "Kitchen", "Dining area", "Bathroom", "Outdoor", "Study room", "Basement", "Attic", "Closet"], "Input": ["Living room", "Bedroom", "Kitchen", "Home office", "Bathroom", "Garage", "Outdoor", "Study room", "Basement", "Attic"], "Temperature": ["Living room", "Bedroom", "Kitchen", "Bathroom", "Basement", "Attic", "Greenhouse", "Outdoor", "Wine cellar", "HVAC room"], "Smoke": ["Kitchen", "Living room", "Bedroom", "Garage", "Office", "Basement", "Attic", "Hallway", "Entryway", "Laundry room"], "Humidity": ["Bathroom", "Kitchen", "Living room", "Bedroom", "Greenhouse", "Basement", "Laundry room", "Garage", "Attic", "Plant nursery"]}

In [37]:
# df_dict = {'device': [], 'location': []}
# for device, places in locations.items():
#     df_dict['device'] = df_dict['device'] + [device] * len(places)
#     df_dict['location'] = df_dict['location'] + places
# df = pd.DataFrame(data=df_dict)
# df.to_csv('data/datasets/partial/locations.csv', index=False)

## Split dataset

In [22]:
templated_actions = pd.read_csv(DATA_DIR / 'datasets/partial/templated_actions.csv')

In [25]:
set_seed()

train_idx = int(len(templated_actions) * 0.8)
val_idx = train_idx + int(len(templated_actions) * 0.1)

df = templated_actions.sample(frac=1)
train_df = df.iloc[:train_idx]
val_df = df.iloc[train_idx:val_idx]
test_df = df.iloc[val_idx:]

train_df.to_csv(DATA_DIR / 'datasets/partial/templated_actions_train.csv', index=False)
val_df.to_csv(DATA_DIR / 'datasets/partial/templated_actions_val.csv', index=False)
test_df.to_csv(DATA_DIR / 'datasets/partial/templated_actions_test.csv', index=False)

In [28]:
train_df.shape

(432, 4)

In [30]:
len(test_df['mtd'].unique())

29

In [15]:
types_environments_by_num = 6
num_locations_per_device = 10
num_device_types = 7
num_functions_per_device = 5
print(f'{types_environments_by_num * num_locations_per_device * num_device_types * num_functions_per_device} possible combinations per one unique prompt')

2100 possible combinations per one unique prompt


## Generate merged dataset

In [34]:
import random

In [35]:
def generate_env(target_type, target_name):
    possible_ids = list(range(1, 99))

    target_id = random.choice(possible_ids)
    possible_ids.remove(target_id)

    env_devices = [f'{target_type}, name="{target_name}", id={target_id}']

    generated_device_names = [target_name]
    num_devices = np.random.randint(3, 8)
    for i in range(num_devices - 1):
        device_name = target_name
        while device_name in generated_device_names:
            device_type = random.choice(DEVICES)
            location = random.choice(list(DEVICE_LOCATIONS[DEVICE_LOCATIONS['device'] == device_type]['location']))
            number = np.random.randint(1, 5)
            device_name = f'{location} {device_type} {number}'
        device_id = random.choice(possible_ids)

        env_devices.append(f'{device_type}, name="{device_name}", id={device_id}')

        generated_device_names.append(device_name)
        possible_ids.remove(device_id)
        
    random.shuffle(env_devices)
    env = ';\n'.join(env_devices)

    return env, target_id

In [36]:
set_seed()

NUM_DATASET = 0

for split in ['train', 'val', 'test']:
    templated_actions = pd.read_csv(DATA_DIR / f'datasets/partial/templated_actions_{split}.csv')
    out_df = pd.DataFrame(columns=['id','device','location','number','device_name','device_id','env','user_cmd','mtd','json_cmd'])

    SAMPLES_PER_TEMPLATE = 15

    row_id = 0
    for i, row in templated_actions.iterrows():
        for _ in range(SAMPLES_PER_TEMPLATE):
            device = row['device']

            location = random.choice(list(DEVICE_LOCATIONS[DEVICE_LOCATIONS['device'] == device]['location']))
            number = np.random.randint(1, 5)
            device_name = f'{location} {device} {number}'
            
            if not '{DEVICE_NAME}' in row['user_cmd']:
                print(i)
            user_cmd = row['user_cmd'].format(**{'DEVICE_NAME': device_name})

            env, device_id = generate_env(device, device_name)

            json_cmd = row['json_cmd']
            json_cmd = json_cmd.replace('444', str(device_id))

            sample_dict = {
                'id': row_id,
                'device': row['device'],
                'location': location,
                'number': number,
                'device_name': device_name,
                'device_id': device_id,
                'env': env,
                'user_cmd': user_cmd,
                'mtd': row['mtd'],
                'json_cmd': json_cmd
            }
            out_df.loc[len(out_df)] = pd.Series(sample_dict)

            row_id += 1

    out_df.to_csv(DATA_DIR / f'datasets/merged/{split}_{NUM_DATASET}.csv', index=False)

In [37]:
train_df = pd.read_csv(DATA_DIR / f'datasets/merged/train_0.csv')
train_df.shape

(6480, 12)

In [8]:
# templated_actions = pd.read_csv(DATA_DIR / 'datasets/partial/templated_actions.csv')
# templated_actions = templated_actions.drop(columns='id')
# templated_actions.to_csv(DATA_DIR / 'datasets/partial/templated_actions.csv', index=False)

## Add retrieved methods

In [21]:
from llama_index.core import (
    Settings,
    StorageContext,
    load_index_from_storage,
)
from llama_index.core.embeddings.utils import resolve_embed_model
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core import Settings

ModuleNotFoundError: No module named 'llama_index'

In [7]:
Settings.embed_model = resolve_embed_model("local:BAAI/bge-base-en-v1.5")

# rebuild storage context
storage_context = StorageContext.from_defaults(persist_dir=(DATA_DIR / 'persist_dir'))

# load index
index = load_index_from_storage(storage_context, show_progress=True)

In [8]:
retriever = VectorIndexRetriever(
    index=index,
    similarity_top_k=30,
)

In [24]:
def get_methods_description(retrieved_nodes, gt_mtd):
    methods_names = []
    methods_description = ''
    for k, node in enumerate(retrieved_nodes):
        methods_description += f'API method {k}:\n{node.text}\n\n'
        method_name = node.metadata['file_name'].replace('.md', '')
        methods_names.append(method_name)
    
    with open(DATA_DIR / 'docs/methods' / f'{gt_mtd}.md') as f:
        gt_mtd_description = f.read()
    methods_description += f'API method {k+1}:\n{gt_mtd_description}'
    methods_names.append(gt_mtd)

    methods_names = ','.join(methods_names)

    return methods_names, methods_description

In [5]:
allowed_methods = []

advance_methods = ['GetConfig', 'SetConfig', 'GetStatus']
for p in METHODS_DIR.iterdir():
    if p.stem.split('.')[1] in advance_methods:
        continue
    else:
        allowed_methods.append(p.stem)

print(allowed_methods)

['Switch.Toggle', 'Cover.Stop', 'Cover.Calibrate', 'Temperature.SetTemperature', 'Light.ResetCounters', 'Light.Set', 'Switch.ResetCounters', 'Light.Toggle', 'Cover.GoToPosition', 'Cover.ResetCounters', 'Light.Calibrate', 'Smoke.Mute', 'Humidity.SetHumidity', 'Cover.Open', 'Switch.Set', 'Cover.Close', 'Input.ResetCounters']


In [None]:
DATASET_NUM = 0

for split in ['train', 'val']:
    df = pd.read_csv(DATA_DIR / f'datasets/merged/{split}_{DATASET_NUM}.csv')
    df['methods_descr'] = ''
    df['methods_names'] = ''

    for i, row in df.iterrows():
        user_cmd = row['user_cmd']
        gt_mtd = row['mtd']

        retrieval_prompt = "Represent this sentence for searching relevant passages: " + user_cmd #.format(DEVICE_NAME=row['device'])
        retrieved_nodes = retriever.retrieve(retrieval_prompt)

        cleaned_notes = []
        for node in retrieved_nodes:
            mtd_name = node.metadata['file_name'].replace('.md', '')
            if mtd_name != gt_mtd and mtd_name in allowed_methods:
                cleaned_notes.append(node)
            if len(cleaned_notes) == 2:
                break
        
        methods_names, methods_description = get_methods_description(cleaned_notes, gt_mtd)

        df.loc[i, 'methods_descr'] = methods_description
        df.loc[i, 'methods_names'] = methods_names

    df.to_csv(DATA_DIR / f'datasets/merged/{split}_{DATASET_NUM}.csv', index=False)