In [2]:
from tqdm import tqdm
from pprint import pprint
from collections import defaultdict
import json
import pdb
import sys
import os

from minimal_webshop.envs.web_agent_text_env import WebAgentTextEnv
from minimal_webshop.engine.goal import get_human_goals, get_synthetic_goals
import random
from pprint import pprint

from openai import OpenAI
import yaml

# Read the OpenAI API key from the local secrets file. The `with` block closes
# the file handle as soon as the YAML has been parsed instead of leaking it.
with open('../secrets.yaml', 'r') as secrets_file:
    secret = yaml.safe_load(secrets_file)
client = OpenAI(api_key=secret['OPENAI_API_KEY'])


# Build the WebAgentTextEnv environment and keep handles to its product data.
env = WebAgentTextEnv(observation_mode='text_rich', human_goals=True)
s = env.browser.server
all_products = s.all_products
product_prices = s.product_prices
# Parallel to all_products: product_asins[i] is the ASIN of all_products[i].
product_asins = [p['asin'] for p in all_products]

goals = env.server.goals
print(f"Number of goals: {len(goals)}")




100%|██████████| 1181436/1181436 [00:23<00:00, 51354.90it/s] 


Loaded 12087 goals.
Number of goals: 12087


In [67]:
def print_product(asin):
    """Print a readable summary of the catalogue product with the given ASIN.

    The product is located by its position in the module-level
    ``product_asins`` list and read from ``all_products`` at the same index.
    Identifying fields, price and attributes are printed first, followed by
    every option value grouped by option type; an option's image URL is
    appended when ``option_to_image`` holds a non-empty entry for it.

    Raises:
        ValueError: if ``asin`` does not occur in ``product_asins``.
    """
    item = all_products[product_asins.index(asin)]

    header_lines = (
        f"ASIN: {item['asin']}",
        f"Name: {item['name']}",
        f"Product category: {item['product_category']}",
        f"Relevant search query: {item['query']}",
        f"Price: {item['pricing']}",
        f"Attributes: {item['Attributes']}",
        "Options:",
    )
    for line in header_lines:
        print(line)

    for option_type, options in item['options'].items():
        print(f"- {option_type}:")
        for option in options:
            # Looked up per option (not hoisted) so a product without options
            # never needs an 'option_to_image' entry.
            images = item['option_to_image']
            suffix = ""
            if option in images and images[option]:
                suffix = f" (image: {images[option]})"
            print(f"\t{option}" + suffix)

# A product is eligible for scenario generation when it has 2-5 option types,
# every option type offers 2-6 values, it lists 3-6 attributes, and its
# product category is non-empty.
legit_products = [
    product for product in all_products
    if 2 <= len(product['options']) <= 5
    and 3 <= len(product['Attributes']) <= 6
    and all(2 <= len(values) <= 6 for values in product['options'].values())
    and product['product_category']
]
# The summary states the bounds the filter really applies (the earlier wording,
# "at least 2 options and 4 attributes", did not match the conditions above).
print(f"Number of products with 2-5 option types (2-6 values each) and 3-6 attributes: {len(legit_products)}")

# Categories are collected from the full catalogue, so a category with no
# eligible product still appears below with a count of 0.
categories = {product['category'] for product in all_products}
print(f"Number of product categories: {len(categories)}")
category2products = {c: [p for p in legit_products if p['category'] == c] for c in categories}
print("Number of products per category:")
for c, products in category2products.items():
    print(f"\t{c}: {len(products)}")

Number of products with at least 2 options and 4 attributes: 47149
Number of product categories: 5
Number of products per category:
	grocery: 1688
	beauty: 2341
	electronics: 3184
	garden: 12400
	fashion: 27536


## v1 Scenario goals

In [3]:
# Prompt templates for the v1 scenario pipeline. Each one is filled in with
# str.format() by generate_scenario and sent as the user message.

# Step 1: infer the product type from the product name, category and search query.
# Placeholders: {product_name}, {product_category}, {query}.
product_identification_prompt = """Product name: {product_name}
Product category: {product_category}
Relevant search query: {query}

Infer what the desired product is, based on the product name,product category and search query. Your output should be a one or few word object (e.g. desk chair, cheese dip, bed frame, etc.)
"""

# Step 2: choose which of the available option types a buyer would care about.
# Placeholders: {product_type}, {option_types}. The reply is expected to be a
# comma-separated list.
option_type_selection_prompt = """Desired product: {product_type}
Set of product options available: {option_types}

For the desired product, identify which options, out of the ones available, a person will absolutely care about when buying that desired product. For example, if the product is a t-shirt, the person will definitely care about the color, size and material, but maybe not the pattern. On the other hand, if the product is a pen drive, the size (capacity) will be important, but the color will not be.
Your output should be a comma-separated list of relevant option types, without any additional text or punctuation.
"""

# Step 3: choose one value for each selected option type.
# Placeholders: {product_name}, {desired_option_types}, {options_dict}. The reply
# is expected to be a dictionary mapping option type to the chosen value.
option_selection_prompt = """Product name: {product_name}
Desired option types: {desired_option_types}
Dictionary of option types to possible option values:  {options_dict}

For each of the desired option types above, and each of the possible option values for that option type, identify which option values are mentioned in the product name. If the product name does not mention an option value, then select a random option value for that option type.

Your output should be a dictionary where the keys are the desired option types, and the values are the selected option values.
"""

# Step 4: write the 1-2 sentence, second-person scenario text.
# Placeholders: {product_type}, {attributes}, {goal_options}.
scenario_creation_prompt = """Product: {product_type}
Product attributes: {attributes}
Options that the scenario must still be valid for: {goal_options}

Create a scenario where a person would want to buy a product with the listed attributes. 
- It should be clear from the scenario that the person will want to buy that specific product, not something similar but different. You can explicitly mention the product to remove any ambiguity (e.g. "You want to buy a hair straightener for...").
- The scenario should not explicitly mention the attributes, but the constraints described by the scenario should lead a user to want to buy the product with those specific attributes. 
- The scenario can also mention some characteristics of the person that may be relevant in motivating them to buy the product with those attributes (e.g. "You have long thick hair...")
- The scenario should also be valid for the listed options, but it should not explicitly mention them. 
- The scenario should be 1-2 sentences long, and be written in the second person (e.g. "You are a..."). 
"""


def generate_scenario(product):
    """Create a shopping-scenario goal for ``product`` with four GPT calls.

    The calls, in order: infer the product type, pick the option types a buyer
    would care about, pick one value per chosen option type, and write the
    scenario text. The goal's attributes are sampled at random from
    ``product['Attributes']``. Progress is printed after every step.

    Args:
        product: Product dict. The keys read here are 'asin', 'name',
            'product_category', 'query', 'Attributes' and 'options'
            (mapping of option type to its list of values).

    Returns:
        dict with keys 'asin', 'attributes', 'name', 'product_category',
        'query', 'weight', 'product_type', 'goal_options',
        'goal_options_dict' and 'instruction_text'.

    Raises:
        AssertionError: if none of the option types named by the model exists
            on the product.
        json.JSONDecodeError: if the option-value reply is not valid JSON even
            after removing a Markdown code fence.
    """
    def ask(system_prompt, user_prompt, temperature=0.0):
        """Run one system+user exchange against gpt-4.1 and return the stripped reply."""
        response = client.chat.completions.create(
            model="gpt-4.1",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            temperature=temperature,
            max_tokens=100
        )
        return response.choices[0].message.content.strip()

    def parse_json_reply(text):
        """Parse a JSON reply, tolerating a surrounding Markdown code fence."""
        text = text.strip()
        if text.startswith("```"):
            # Drop the opening fence line (``` or ```json) and the closing fence.
            text = text.split("\n", 1)[1] if "\n" in text else text[3:]
            text = text.rsplit("```", 1)[0]
        return json.loads(text)

    goal = {}

    # Sample 2-4 attributes, but never more than the product has: a fixed upper
    # bound of 4 makes random.sample raise ValueError for 3-attribute products.
    attributes = product['Attributes']
    max_k = min(4, len(attributes))
    sampled_attributes = random.sample(attributes, k=random.randint(min(2, max_k), max_k))

    goal['asin'] = product['asin']
    goal['attributes'] = sampled_attributes
    goal['name'] = product['name']
    goal['product_category'] = product['product_category']
    goal['query'] = product['query']
    goal['weight'] = 1.0

    # First identify the type of product that needs to be bought
    product_type = ask(
        "You are a helpful assistant that identifies products from their name, category and search query.",
        product_identification_prompt.format(
            product_name=product['name'],
            product_category=product['product_category'],
            query=product['query']
        ),
    )
    goal['product_type'] = product_type
    print(f"Step 1. Identified product type: {product_type}")

    # Second, select the option types that are relevant for the product
    reply = ask(
        "You are a helpful assistant that identifies relevant product options.",
        option_type_selection_prompt.format(
            product_type=product_type,
            option_types=', '.join(product['options'].keys()),
        ),
    )
    # Keep only names that really are option types of this product.
    sampled_option_types = [name.strip() for name in reply.split(',') if name.strip() in product['options']]
    assert len(sampled_option_types) > 0, "No relevant option types found for the product."
    print(f"Step 2. Identified the relevant option types: {sampled_option_types}")

    # Then, identify the option values for those option types
    reply = ask(
        "You are a helpful assistant that identifies option values for a product.",
        option_selection_prompt.format(
            product_name=product['name'],
            desired_option_types=', '.join(sampled_option_types),
            options_dict=json.dumps(product['options'])
        ),
    )
    sampled_options_dict = parse_json_reply(reply)
    goal['goal_options'] = list(sampled_options_dict.values())
    goal['goal_options_dict'] = sampled_options_dict
    print(f"Step 3. Identified the following options: {goal['goal_options']}")

    # Finally, generate the scenario based on the product type, attributes and options.
    # This is the only call that samples with temperature 1.0.
    scenario = ask(
        "You are a helpful assistant that generates scenarios for product goals.",
        scenario_creation_prompt.format(
            product_type=product_type,
            attributes=', '.join(goal['attributes']),
            goal_options=', '.join(goal['goal_options']),
        ),
        temperature=1.0,
    )
    goal['instruction_text'] = scenario
    print(f"Step 4. Generated scenario: {goal['instruction_text']}")
    return goal



In [None]:
# Draw one random eligible product from the chosen category and show it.
category = 'garden'
products = category2products[category]
product = random.choice(products)

print("ORIGINAL PRODUCT:")
print_product(product['asin'])
print(100 * "-")

# Interactive confirmation is disabled; the cell always proceeds.
# choice = input("Do this product? (y/n): ")
choice = 'y'  # For testing purposes, we automatically proceed with the product
if choice.strip().lower() == 'y':
    print("GENERATING SCENARIO:")
    goal = generate_scenario(product)
    print(100 * "-")

    print("GENERATED REALISTIC GOAL:")
    pprint(goal)


In [None]:
# Dump the raw product dict chosen in the previous cell for manual inspection.
pprint(product)

## v2 Scenario Creation

In [100]:
# Prompt templates for the v2 scenario pipeline. Each one is filled in with
# str.format() by generate_scenario and sent as the user message.

# Step 1: infer the product type from the product name and category.
# Placeholders: {product_name}, {product_category}.
product_identification_prompt = """Product name: {product_name}
Product category: {product_category}

Infer what the desired product is, based on the product name and product category. Your output should be a one or few word object (e.g. desk chair, cheese dip, bed frame, etc.)
"""

# Step 2: choose which of the available option types a buyer would care about.
# Placeholders: {product_type}, {option_types}.
option_type_selection_prompt = """Desired product: {product_type}
Set of product options available: {option_types}

For the desired product, identify which options, out of the ones available, a person will absolutely care about when buying that desired product. For example, if the product is a t-shirt, the person will definitely care about the color, size and material, but maybe not the pattern. On the other hand, if the product is a pen drive, the size (capacity) will be important, but the color will not be.
Your output should be a comma-separated list of relevant option types, without any additional text or punctuation.
"""

# Step 3: choose one value for each selected option type.
# Placeholders: {product_name}, {desired_option_types}, {options_dict}.
option_selection_prompt = """Product name: {product_name}
Desired option types: {desired_option_types}
Dictionary of option types to possible option values:  {options_dict}

For each of the desired option types above, and each of the possible option values for that option type, identify which option values are mentioned in the product name. If the product name does not mention an option value, then select a random option value for that option type.

Your output should be a dictionary where the keys are the desired option types, and the values are the selected option values.
"""

# Step 4: write the 1-2 sentence, second-person scenario text.
# Placeholders: {product_type}, {attributes}, {goal_options}.
scenario_creation_prompt = """Product: {product_type}
Product attributes: {attributes}
Options that the scenario must still be valid for: {goal_options}

Create a scenario where a person would want to buy a product with the listed attributes. 
- It should be clear from the scenario that the person will want to buy that specific product, not something similar but different. You can explicitly mention the product to remove any ambiguity (e.g. "You want to buy a hair straightener for...").
- The scenario should not explicitly mention the attributes, but the constraints described by the scenario should lead a user to want to buy the product with those specific attributes. 
- The scenario can also mention some characteristics of the person that may be relevant in motivating them to buy the product with those attributes (e.g. "You have long thick hair...")
- The scenario should also be valid for the listed options, but it should not explicitly mention them. 
- The scenario should be 1-2 sentences long, and be written in the second person (e.g. "You are a..."). 
"""

# Step 5: ask for a price ceiling above the listed price.
# Placeholders: {product_name}, {product_price}.
# The reply is parsed with float(), so the example shows a bare whole number;
# the earlier example ("$5.99") contradicted the "whole number" instruction.
price_upper_prompt = """Product: {product_name}
Product price: {product_price}

Given the product name and price, identify the maximum price that a person would be willing to pay for the product (must be higher than the product price). Your output should be a whole number with no currency symbol and no decimals (e.g. 20, not $19.99), preferably rounded to a multiple of 5 or 10, without any additional text or punctuation.
"""


def generate_scenario(product):
    """Create a shopping-scenario goal for ``product`` with five GPT calls.

    The calls, in order: infer the product type, pick the option types a buyer
    would care about, pick one value per chosen option type, write the
    scenario text, and estimate a maximum acceptable price. The goal's
    attributes are sampled at random from ``product['Attributes']``. Progress
    is printed after every step.

    Args:
        product: Product dict. The keys read here are 'asin', 'name',
            'product_category', 'query', 'category', 'pricing', 'Attributes'
            and 'options' (mapping of option type to its list of values).

    Returns:
        dict with keys 'asin', 'attributes', 'name', 'product_category',
        'query', 'weight', 'category', 'product_type', 'goal_options',
        'goal_options_dict', 'instruction_text' and 'price_upper' (float).

    Raises:
        AssertionError: if none of the option types named by the model exists
            on the product.
        json.JSONDecodeError: if the option-value reply is not valid JSON even
            after removing a Markdown code fence.
        ValueError: if the price reply is not a number once '$' and ','
            have been removed.
    """
    def ask(system_prompt, user_prompt, temperature=0.0):
        """Run one system+user exchange against gpt-4.1 and return the stripped reply."""
        response = client.chat.completions.create(
            model="gpt-4.1",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            temperature=temperature,
            max_tokens=100
        )
        return response.choices[0].message.content.strip()

    def parse_json_reply(text):
        """Parse a JSON reply, tolerating a surrounding Markdown code fence."""
        text = text.strip()
        if text.startswith("```"):
            # Drop the opening fence line (``` or ```json) and the closing fence.
            text = text.split("\n", 1)[1] if "\n" in text else text[3:]
            text = text.rsplit("```", 1)[0]
        return json.loads(text)

    def parse_price(text):
        """Convert a price reply such as '25', '$25' or '1,200' to a float."""
        return float(text.replace('$', '').replace(',', '').strip())

    goal = {}

    # Sample 2-4 attributes, but never more than the product has: a fixed upper
    # bound of 4 makes random.sample raise ValueError for 3-attribute products.
    attributes = product['Attributes']
    max_k = min(4, len(attributes))
    sampled_attributes = random.sample(attributes, k=random.randint(min(2, max_k), max_k))

    goal['asin'] = product['asin']
    goal['attributes'] = sampled_attributes
    goal['name'] = product['name']
    goal['product_category'] = product['product_category']
    goal['query'] = product['query']
    goal['weight'] = 1.0
    goal['category'] = product['category']

    # First identify the type of product that needs to be bought
    product_type = ask(
        "You are a helpful assistant that identifies products from their name, category and search query.",
        product_identification_prompt.format(
            product_name=product['name'],
            product_category=product['product_category'],
        ),
    )
    goal['product_type'] = product_type
    print(f"Step 1. Identified product type: {product_type}")

    # Second, select the option types that are relevant for the product
    reply = ask(
        "You are a helpful assistant that identifies relevant product options.",
        option_type_selection_prompt.format(
            product_type=product_type,
            option_types=', '.join(product['options'].keys()),
        ),
    )
    # Keep only names that really are option types of this product.
    sampled_option_types = [name.strip() for name in reply.split(',') if name.strip() in product['options']]
    assert len(sampled_option_types) > 0, "No relevant option types found for the product."
    print(f"Step 2. Identified the relevant option types: {sampled_option_types}")

    # Then, identify the option values for those option types
    reply = ask(
        "You are a helpful assistant that identifies option values for a product.",
        option_selection_prompt.format(
            product_name=product['name'],
            desired_option_types=', '.join(sampled_option_types),
            options_dict=json.dumps(product['options'])
        ),
    )
    sampled_options_dict = parse_json_reply(reply)
    goal['goal_options'] = list(sampled_options_dict.values())
    goal['goal_options_dict'] = sampled_options_dict
    print(f"Step 3. Identified the following options: {goal['goal_options']}")

    # Generate the scenario based on the product type, attributes and options.
    # This is the only call that samples with temperature 1.0.
    scenario = ask(
        "You are a helpful assistant that generates scenarios for product goals.",
        scenario_creation_prompt.format(
            product_type=product_type,
            attributes=', '.join(goal['attributes']),
            goal_options=', '.join(goal['goal_options']),
        ),
        temperature=1.0,
    )
    goal['instruction_text'] = scenario
    print(f"Step 4. Generated scenario: {goal['instruction_text']}")

    # Finally, ask for the highest price a buyer would accept.
    price_upper = ask(
        "You are a helpful assistant that identifies the maximum price that a person would be willing to pay for a product.",
        price_upper_prompt.format(
            product_name=product['name'],
            product_price=product['pricing']
        ),
    )
    goal['price_upper'] = parse_price(price_upper)
    print(f"Step 5. Identified the maximum price that a person would be willing to pay for the product: {goal['price_upper']}")
    return goal



In [None]:
# Draw one random eligible product from the chosen category and show it.
category = 'beauty'
products = category2products[category]
product = random.choice(products)

print("ORIGINAL PRODUCT:")
print_product(product['asin'])
print(100 * "-")

In [None]:
# Run the scenario pipeline on the product picked in the previous cell.
print("GENERATING SCENARIO:")
goal = generate_scenario(product)
print(100 * "-")

print("GENERATED REALISTIC GOAL:")
pprint(goal)

In [None]:
filename = "data_collection_interface/scenario_data/v2_50scenarios.json"

# Load what has been saved so far; a missing file simply means no scenarios yet.
if os.path.exists(filename):
    with open(filename, "r") as f:
        scenarios = json.load(f)
else:
    scenarios = []

# Skip products that are already stored so that re-running this cell with the
# same `goal` does not append a duplicate under a new scenario ID.
scenario_asins = [s['asin'] for s in scenarios]
if goal['asin'] not in scenario_asins:
    # IDs are "<category><NNN>", numbered by how many scenarios of that
    # category are already stored.
    num_samecategory_already = len([s for s in scenarios if s['category'] == goal['category']])
    goal['scenario_id'] = f"{goal['category']}{num_samecategory_already:03d}"
    scenarios.append(goal)
    # Serialise before opening for writing: mode "w" truncates the file, so a
    # failure inside json.dump would otherwise wipe the scenarios saved so far.
    serialized = json.dumps(scenarios, indent=4)
    with open(filename, "w") as f:
        f.write(serialized)
    print(f"Wrote {len(scenarios)} scenarios to {filename}")
    print(f"Last scenario ID: {goal['scenario_id']}")
else:
    print(f"Scenario {goal['asin']} already exists in {filename}")

In [190]:
# Re-read the saved scenarios, order them by scenario ID and write them back.
with open(filename, "r") as f:
    scenarios = json.load(f)
scenarios.sort(key=lambda x: x['scenario_id'])
# Serialise first (mode "w" truncates the file) and write through a context
# manager so the handle is flushed and closed deterministically.
serialized = json.dumps(scenarios, indent=4)
with open(filename, "w") as f:
    f.write(serialized)

In [None]:
# tab-separated values: one header row, then one row per scenario
print('\t'.join((
    "Scenario ID",
    "Instruction Text",
    "Actual Product Name",
    "Product Attributes",
    "Product Options",
)))
for s in scenarios:
    print('\t'.join(str(field) for field in (
        s['scenario_id'],
        s['instruction_text'],
        s['name'],
        ', '.join(s['attributes']),
        s['goal_options_dict'],
    )))


In [None]:
# List the product type of every scenario, grouped by category in a fixed order.
for category in ['beauty', 'garden', 'electronics', 'fashion', 'grocery']:
    category_scenarios = [s for s in scenarios if s['category'] == category]
    print(category)
    for s in category_scenarios:
        print("-", s['product_type'])

## v3 Scenarios

In [68]:
# Prompt templates for the v3 scenario pipeline. Each one is filled in with
# str.format() by generate_scenario and sent as the user message.

# Step 1: infer the generic product type from the product name and category.
# Placeholders: {product_name}, {product_category}.
product_identification_prompt = """Product name: {product_name}
Product category: {product_category}

Infer what the desired product is, based on the product name and product category. Your output should be a one or few word object (e.g. desk chair, cheese dip, bed frame, etc.). It should be the generic product type, not a specific variant (e.g. brand, flavor, color, etc.)
"""

# Step 2: choose 2-4 relevant attributes out of the product's candidates.
# Placeholders: {product_type}, {candidate_attributes}.
attribute_selection_prompt = """Product type: {product_type}
Candidate attributes: {candidate_attributes}

Identify 2-4 attributes that may be relevant to someone who wants to buy the product. Ignore attributes that sound like object properties (e.g. color, size, weight, etc.), are not relevant to the product type (e.g. brand, flavor, etc.) or are the nouns that do not sound like actual product attributes. Your output should be a comma-separated list of attributes, without any additional text or punctuation.
"""

# Step 3: choose which of the available option types a buyer would care about.
# Placeholders: {product_type}, {option_types}.
option_type_selection_prompt = """Desired product: {product_type}
Set of product options available: {option_types}

For the desired product, identify which options, out of the ones available, a person will absolutely care about when buying that desired product. For example, if the product is a t-shirt, the person will definitely care about the color, size and material, but maybe not the pattern. On the other hand, if the product is a pen drive, the size (capacity) will be important, but the color will not be.
Your output should be a comma-separated list of relevant option types, without any additional text or punctuation.
"""

# Step 4: choose one value for each selected option type.
# Placeholders: {product_name}, {desired_option_types}, {options_dict}.
option_selection_prompt = """Product name: {product_name}
Desired option types: {desired_option_types}
Dictionary of option types to possible option values:  {options_dict}

For each of the desired option types above, and each of the possible option values for that option type, identify which option values are mentioned in the product name. If the product name does not mention an option value, then select a random option value for that option type.

Your output should be a dictionary where the keys are the desired option types, and the values are the selected option values.
"""

# Step 5: write the scenario text followed by a "REASONING:" section.
# Placeholders: {product_type}, {attributes}, {goal_options}.
scenario_creation_prompt = """Product: {product_type}
Product attributes: {attributes}
Options that the scenario must still be valid for: {goal_options}

Create a scenario where a person would want to buy a product with the listed attributes. 
- It should be clear from the scenario that the person will want to buy that specific product, not something similar but different. You can explicitly mention the product to remove any ambiguity (e.g. "You want to buy a hair straightener for...").
- The scenario should preferably not explicitly mention the attributes, but the constraints described by the scenario should lead a user to want to buy the product with those specific attributes i.e. if a user is presented just this scenario, they will understand that they need to buy the product with those specific attributes. If it is not possible to write a scenario that reliably leads to the product with those attributes without explicitly mentioning the attributes, then you can mention the attributes but try to be creative.
- The scenario can also mention some characteristics of the person that may be relevant in motivating them to buy the product with those attributes (e.g. "You have long thick hair...")
- The scenario should also be valid for the listed options, but it should not explicitly mention them. 
- The scenario should be 1-2 sentences long, and be written in the second person (e.g. "You are a..."). 

Output the scenario, then "REASONING:" and then, for each attribute (not the option values), explain why the user should buy a product with that attribute (in the second person).
"""

# Step 6: ask for a price ceiling above the listed price.
# Placeholders: {product_name}, {product_price}.
# The reply is parsed with float(), so the example shows a bare whole number;
# the earlier example ("$5.99") contradicted the "whole number" instruction.
price_upper_prompt = """Product: {product_name}
Product price: {product_price}

Given the product name and price, identify the maximum price that a person would be willing to pay for the product (must be higher than the product price). Your output should be a whole number with no currency symbol and no decimals (e.g. 20, not $19.99), preferably rounded to a multiple of 5 or 10, without any additional text or punctuation.
"""


def generate_scenario(product):
    """Create a shopping-scenario goal for ``product`` with six GPT calls.

    The calls, in order: infer the product type, pick the relevant attributes,
    pick the option types a buyer would care about, pick one value per chosen
    option type, write the scenario text with per-attribute reasoning, and
    estimate a maximum acceptable price. Progress is printed after every step.

    Args:
        product: Product dict. The keys read here are 'asin', 'name',
            'product_category', 'query', 'category', 'pricing', 'Attributes'
            and 'options' (mapping of option type to its list of values).

    Returns:
        dict with keys 'asin', 'name', 'product_category', 'query', 'weight',
        'category', 'product_type', 'attributes', 'goal_options',
        'goal_options_dict', 'instruction_text', 'attribute_wise_reasoning'
        and 'price_upper' (float). 'attribute_wise_reasoning' is an empty
        string when the reply contains no "REASONING:" marker.

    Raises:
        AssertionError: if none of the option types named by the model exists
            on the product.
        json.JSONDecodeError: if the option-value reply is not valid JSON even
            after removing a Markdown code fence.
        ValueError: if the price reply is not a number once '$' and ','
            have been removed.
    """
    def ask(system_prompt, user_prompt, temperature=0.0, max_tokens=100):
        """Run one system+user exchange against gpt-4.1 and return the stripped reply."""
        response = client.chat.completions.create(
            model="gpt-4.1",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            temperature=temperature,
            max_tokens=max_tokens
        )
        return response.choices[0].message.content.strip()

    def parse_json_reply(text):
        """Parse a JSON reply, tolerating a surrounding Markdown code fence."""
        text = text.strip()
        if text.startswith("```"):
            # Drop the opening fence line (``` or ```json) and the closing fence.
            text = text.split("\n", 1)[1] if "\n" in text else text[3:]
            text = text.rsplit("```", 1)[0]
        return json.loads(text)

    def parse_price(text):
        """Convert a price reply such as '25', '$25' or '1,200' to a float."""
        return float(text.replace('$', '').replace(',', '').strip())

    goal = {}
    goal['asin'] = product['asin']
    goal['name'] = product['name']
    goal['product_category'] = product['product_category']
    goal['query'] = product['query']
    goal['weight'] = 1.0
    goal['category'] = product['category']

    # First identify the type of product that needs to be bought
    product_type = ask(
        "You are a helpful assistant that identifies products from their name, category and search query.",
        product_identification_prompt.format(
            product_name=product['name'],
            product_category=product['product_category'],
        ),
    )
    goal['product_type'] = product_type
    print(f"Step 1. Identified product type: {product_type}")

    # Then, identify the attributes that are relevant for the product
    reply = ask(
        "You are a helpful assistant that identifies relevant product attributes.",
        attribute_selection_prompt.format(
            product_type=product_type,
            candidate_attributes=', '.join(product['Attributes'])
        ),
    )
    # Keep only attributes that match one of the product's attributes exactly.
    goal['attributes'] = [name.strip() for name in reply.split(',') if name.strip() in product['Attributes']]
    print(f"Step 2. Identified the relevant attributes: {goal['attributes']}")

    # Next, select the option types that are relevant for the product
    reply = ask(
        "You are a helpful assistant that identifies relevant product options.",
        option_type_selection_prompt.format(
            product_type=product_type,
            option_types=', '.join(product['options'].keys()),
        ),
    )
    # Keep only names that really are option types of this product.
    sampled_option_types = [name.strip() for name in reply.split(',') if name.strip() in product['options']]
    assert len(sampled_option_types) > 0, "No relevant option types found for the product."
    print(f"Step 3. Identified the relevant option types: {sampled_option_types}")

    # Then, identify the option values for those option types
    reply = ask(
        "You are a helpful assistant that identifies option values for a product.",
        option_selection_prompt.format(
            product_name=product['name'],
            desired_option_types=', '.join(sampled_option_types),
            options_dict=json.dumps(product['options'])
        ),
    )
    sampled_options_dict = parse_json_reply(reply)
    goal['goal_options'] = list(sampled_options_dict.values())
    goal['goal_options_dict'] = sampled_options_dict
    print(f"Step 4. Identified the following options: {goal['goal_options']}")

    # Generate the scenario based on the product type, attributes and options.
    # This call samples with temperature 1.0 and gets a larger token budget
    # because the reply also carries the reasoning section.
    reply = ask(
        "You are a helpful assistant that generates scenarios for product goals.",
        scenario_creation_prompt.format(
            product_type=product_type,
            attributes=', '.join(goal['attributes']),
            goal_options=', '.join(goal['goal_options']),
        ),
        temperature=1.0,
        max_tokens=300,
    )
    # partition() never raises: a reply without the marker yields an empty
    # reasoning string instead of an IndexError.
    scenario, _, reasoning = reply.partition("REASONING:")
    scenario = scenario.strip()
    reasoning = reasoning.strip()
    goal['instruction_text'] = scenario
    goal['attribute_wise_reasoning'] = reasoning
    print(f"Step 5. Generated scenario: {goal['instruction_text']}")
    print(f"REASONING: \n{reasoning}")

    # Finally, ask for the highest price a buyer would accept.
    price_upper = ask(
        "You are a helpful assistant that identifies the maximum price that a person would be willing to pay for a product.",
        price_upper_prompt.format(
            product_name=product['name'],
            product_price=product['pricing']
        ),
    )
    goal['price_upper'] = parse_price(price_upper)
    print(f"Step 6. Identified the maximum price that a person would be willing to pay for the product: {goal['price_upper']}")
    return goal



In [286]:
# Draw one random eligible product from the chosen category and show it.
category = 'grocery'
products = category2products[category]
product = random.choice(products)

print("ORIGINAL PRODUCT:")
print_product(product['asin'])
print(100 * "-")

ORIGINAL PRODUCT:
ASIN: B01M07RTKT
Name: Fresh Wave Odor Eliminator Spray & Air Freshener, 8 fl. oz, Natural Ingredients (Pack of 2)
Product category: Health & Household › Household Supplies › Air Fresheners › Spray
Relevant search query: air freshener supplies
Price: [14.99]
Attributes: ['natural ingredients', 'plant based', 'non gmo']
Options:
- scent:
	lavender (image: https://m.media-amazon.com/images/I/510xTgxdCaL.jpg)
	original (image: https://m.media-amazon.com/images/I/514rRv4aFxL.jpg)
- size:
	8 fl oz (pack of 1)
	8 fl oz (pack of 2)
	32 fl oz (pack of 1)
----------------------------------------------------------------------------------------------------


In [287]:
# Run the scenario pipeline on the product picked in the previous cell.
print("GENERATING SCENARIO:")
goal = generate_scenario(product)
print(100 * "-")

print("GENERATED REALISTIC GOAL:")
pprint(goal)

GENERATING SCENARIO:
Step 1. Identified product type: air freshener spray
Step 2. Identified the relevant attributes: ['natural ingredients', 'plant based', 'non gmo']
Step 3. Identified the relevant option types: ['scent', 'size']
Step 4. Identified the following options: ['original', '8 fl oz (pack of 2)']
Step 5. Generated scenario: Your young child has allergies and you’re looking to freshen your home without exposing them to any artificial chemicals, so you want to buy an air freshener spray that’s safe to use around kids and pets.
REASONING: 
- Natural ingredients: You want to avoid harsh or artificial chemicals that might trigger your child’s allergies or cause irritation, so a natural air freshener is best.
- Plant based: Plant-based products are less likely to contain allergens and synthetic fragrances, making your home safer for children and pets.
- Non-GMO: You are conscious about the overall health effects of products in your home and trust non-GMO products to align with yo

In [288]:
filename = "scenario_data/v3_heldout_50scenarios.json"

# Load what has been saved so far; a missing file simply means no scenarios yet.
if os.path.exists(filename):
    with open(filename, "r") as f:
        scenarios = json.load(f)
else:
    scenarios = []

# A product is stored at most once, identified by its ASIN.
scenario_asins = [s['asin'] for s in scenarios]
if goal['asin'] not in scenario_asins:
    # IDs are "<category><NNN>", numbered by how many scenarios of that
    # category are already stored.
    num_samecategory_already = len([s for s in scenarios if s['category'] == goal['category']])
    goal['scenario_id'] = f"{goal['category']}{num_samecategory_already:03d}"
    scenarios.append(goal)
    # Serialise before opening for writing: mode "w" truncates the file, so a
    # failure inside json.dump would otherwise wipe the scenarios saved so far.
    serialized = json.dumps(scenarios, indent=4)
    with open(filename, "w") as f:
        f.write(serialized)
    print(f"Wrote {len(scenarios)} scenarios to {filename}")
    print(f"Last scenario ID: {goal['scenario_id']}")
else:
    print(f"Scenario {goal['asin']} already exists in {filename}")


Wrote 50 scenarios to scenario_data/v3_heldout_50scenarios.json
Last scenario ID: grocery009


In [54]:
# Re-read the saved scenarios, order them by scenario ID and write them back.
with open(filename, "r") as f:
    scenarios = json.load(f)
scenarios.sort(key=lambda x: x['scenario_id'])
# Serialise first (mode "w" truncates the file) and write through a context
# manager so the handle is flushed and closed deterministically.
serialized = json.dumps(scenarios, indent=4)
with open(filename, "w") as f:
    f.write(serialized)

# List the product type of every scenario, grouped by category in a fixed order.
for category in ['beauty', 'garden', 'electronics', 'fashion', 'grocery']:
    category_scenarios = [s for s in scenarios if s['category'] == category]
    print(category)
    for s in category_scenarios:
        print(f"- {s['product_type']}")

beauty
- makeup brush set
- lip balm
- hair dye
- deodorant spray
- conditioner
- hair cutting cape
- shower cap
- body scrub
- toothpaste
- massage table sheets
garden
electronics
fashion
grocery


## Scenarios analysis

In [3]:
# Load the saved v2 scenarios that the analysis cells below work on.
filename = "data_collection_interface/scenario_data/v2_50scenarios.json"
with open(filename) as f:
    scenarios = json.loads(f.read())

In [None]:
import numpy as np
from tqdm import tqdm

# Make the environment serve the generated scenarios as its goals, so that
# env.reset(i) selects scenario i. The cumulative weights start at 0.
env.server.goals = scenarios
env.server.weights = [goal['weight'] for goal in scenarios]
env.server.cum_weights = [0, *np.cumsum(env.server.weights).tolist()]

# Search with the full scenario text and count how often the target ASIN
# shows up in the returned observation.
num_in_search_obs = 0
for i, s in enumerate(tqdm(scenarios)):
    product_asin = s['asin']
    scenario_text = s['instruction_text']

    env.reset(i)
    action = 'search["{}"]'.format(scenario_text)
    obs, reward, done, info = env.step(action)
    if product_asin in obs:
        num_in_search_obs = num_in_search_obs + 1

print("When using the scenario text as the search query:")
print(f"\tNumber of scenarios where the product is in the search results: {num_in_search_obs}")
print(f"\tPercentage: {num_in_search_obs / len(scenarios):.2%}")


In [None]:
import numpy as np
from tqdm import tqdm

# Make the environment serve the generated scenarios as its goals, so that
# env.reset(i) selects scenario i. The cumulative weights start at 0.
env.server.goals = scenarios
env.server.weights = [goal['weight'] for goal in scenarios]
env.server.cum_weights = [0, *np.cumsum(env.server.weights).tolist()]

# Search with only the inferred product type and count how often the target
# ASIN shows up in the returned observation.
num_in_search_obs = 0
for i, s in enumerate(tqdm(scenarios)):
    product_asin = s['asin']
    product_type = s['product_type']

    env.reset(i)
    action = 'search["{}"]'.format(product_type)
    obs, reward, done, info = env.step(action)
    if product_asin in obs:
        num_in_search_obs = num_in_search_obs + 1

print("When using the product type as the search query:")
print(f"\tNumber of scenarios where the product is in the search results: {num_in_search_obs}")
print(f"\tPercentage: {num_in_search_obs / len(scenarios):.2%}")


In [None]:
import numpy as np
from tqdm import tqdm

# Make the environment serve the generated scenarios as its goals, so that
# env.reset(i) selects scenario i. The cumulative weights start at 0.
env.server.goals = scenarios
env.server.weights = [goal['weight'] for goal in scenarios]
env.server.cum_weights = [0] + np.cumsum(env.server.weights).tolist()

# Search with the product type followed by the attribute list and count how
# often the target ASIN shows up in the returned observation.
num_in_search_obs = 0
for i, s in enumerate(tqdm(scenarios)):
    product_asin = s['asin']
    product_type = s['product_type']
    attributes = ', '.join(s['attributes'])

    env.reset(i)
    # A space separates the product type from the first attribute; plain
    # concatenation fused them into one garbled word (e.g. "lip balmvegan").
    action = f"search[\"{product_type} {attributes}\"]"
    obs, reward, done, info = env.step(action)
    if product_asin in obs:
        num_in_search_obs += 1

print("When using the product type and attributes as the search query:")
print(f"\tNumber of scenarios where the product is in the search results: {num_in_search_obs}")
print(f"\tPercentage: {num_in_search_obs / len(scenarios):.2%}")


In [None]:
# Inspect one scenario and the search results for "<product type> <attributes>".
# `i` must already be defined: either left over from a loop in an earlier cell
# or set by re-enabling the line below.
# i = random.randint(0, len(scenarios) - 1)
s = scenarios[i]
print(f"Scenario {i}")
for label, key in (
    ("Product type", 'product_type'),
    ("Attributes", 'attributes'),
    ("Goal options", 'goal_options_dict'),
    ("Instruction text", 'instruction_text'),
):
    print(f"\t{label}: {s[key]}")

env.reset(i)
query = ' '.join([s['product_type'], ', '.join(s['attributes'])])
action = 'search["{}"]'.format(query)
obs, reward, done, info = env.step(action)
print(obs)
